diff --git a/.gitignore b/.gitignore
index 8b11f57..7d5b309 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ dist
 *.build
 *.dist
 *.egg-info
-*.cpython-312.pyc
\ No newline at end of file
+*.cpython-312.pyc
+example-socket-export.py
\ No newline at end of file
diff --git a/socketdev/dependencies/__init__.py b/socketdev/dependencies/__init__.py
index 201a9e0..c55e9f7 100644
--- a/socketdev/dependencies/__init__.py
+++ b/socketdev/dependencies/__init__.py
@@ -2,6 +2,7 @@
 from urllib.parse import urlencode
 import logging
 from socketdev.tools import load_files
+from ..utils import Utils
 
 log = logging.getLogger("socketdev")
 
@@ -12,9 +13,13 @@ class Dependencies:
     def __init__(self, api):
         self.api = api
 
-    def post(self, files: list, params: dict) -> dict:
-        loaded_files = []
-        loaded_files = load_files(files, loaded_files)
+    def post(self, files: list, params: dict, use_lazy_loading: bool = False, workspace: str = None) -> dict:
+        if use_lazy_loading:
+            loaded_files = Utils.load_files_for_sending_lazy(files, workspace)
+        else:
+            loaded_files = []
+            loaded_files = load_files(files, loaded_files)
+
         path = "dependencies/upload?" + urlencode(params)
         response = self.api.do_request(path=path, files=loaded_files, method="POST")
         if response.status_code == 200:
diff --git a/socketdev/diffscans/__init__.py b/socketdev/diffscans/__init__.py
index 86ea575..5febca0 100644
--- a/socketdev/diffscans/__init__.py
+++ b/socketdev/diffscans/__init__.py
@@ -1,6 +1,7 @@
 import json
 import logging
 from typing import Any, Dict, Optional, Union
+from ..utils import Utils
 
 log = logging.getLogger("socketdev")
 
@@ -29,13 +30,44 @@ def get(self, org_slug: str, diff_scan_id: str) -> dict:
         log.error(f"Error fetching diff scan: {response.status_code}, message: {response.text}")
         return {}
 
-    def create_from_repo(self, org_slug: str, repo_slug: str, files: list, params: Optional[Dict[str, Any]] = None) -> dict:
-        """Create a diff scan from repo HEAD, uploading files as multipart form data."""
+    def create_from_repo(self, org_slug: str, repo_slug: str, files: list, params: Optional[Dict[str, Any]] = None, use_lazy_loading: bool = False, workspace: str = None, max_open_files: int = 100) -> dict:
+        """
+        Create a diff scan from repo HEAD, uploading files as multipart form data.
+
+        Args:
+            org_slug: Organization slug
+            repo_slug: Repository slug
+            files: List of file paths to upload for scanning
+            params: Optional query parameters for the request
+            use_lazy_loading: Whether to use lazy file loading to prevent "too many open files"
+                errors when uploading large numbers of files (default: False)
+                NOTE: In version 3.0, this will default to True for better performance
+            workspace: Base directory path to make file paths relative to
+            max_open_files: Maximum number of files to keep open simultaneously when using
+                lazy loading. Useful for systems with low ulimit values (default: 100)
+
+        Returns:
+            dict: API response containing diff scan results
+
+        Note:
+            When use_lazy_loading=True, files are opened only when needed during upload,
+            preventing file descriptor exhaustion. The max_open_files parameter controls how many
+            files can be open simultaneously - set this lower on systems with restrictive ulimits.
+
+            For large file uploads (>100 files), it's recommended to set use_lazy_loading=True.
+        """
         import urllib.parse
         path = f"orgs/{org_slug}/diff-scans/from-repo/{repo_slug}"
         if params:
             path += "?" + urllib.parse.urlencode(params)
-        response = self.api.do_request(path=path, method="POST", files=files)
+
+        # Use lazy loading if requested
+        if use_lazy_loading:
+            prepared_files = Utils.load_files_for_sending_lazy(files, workspace, max_open_files)
+        else:
+            prepared_files = files
+
+        response = self.api.do_request(path=path, method="POST", files=prepared_files)
         if response.status_code in (200, 201):
             return response.json()
         log.error(f"Error creating diff scan from repo: {response.status_code}, message: {response.text}")
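
For context, a minimal usage sketch of the new create_from_repo flags. The socketdev(token=...) entry point, the file list, and the paths are assumptions for illustration; adapt them to your project.

    from socketdev import socketdev

    client = socketdev(token="your-api-token")  # assumed entry point/token
    result = client.diffscans.create_from_repo(
        "my-org",
        "my-repo",
        files=["package.json", "package-lock.json"],
        use_lazy_loading=True,      # open each file only while it is uploaded
        workspace="/path/to/repo",  # keys become paths relative to this directory
        max_open_files=50,          # lower this on systems with a restrictive ulimit
    )
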
diff --git a/socketdev/fullscans/__init__.py b/socketdev/fullscans/__init__.py
index 1f00f64..b543e4e 100644
--- a/socketdev/fullscans/__init__.py
+++ b/socketdev/fullscans/__init__.py
@@ -343,11 +343,9 @@ def from_dict(cls, data: dict) -> "LicenseMatch":
 @dataclass
 class LicenseDetail:
     authors: List[str]
-    charEnd: int
-    charStart: int
+    errorData: str
     filepath: str
     match_strength: int
-    filehash: str
     provenance: str
     spdxDisj: List[List[LicenseMatch]]
 
@@ -360,14 +358,13 @@ def to_dict(self):
     @classmethod
     def from_dict(cls, data: dict) -> "LicenseDetail":
         return cls(
+            spdxDisj=data["spdxDisj"],
             authors=data["authors"],
-            charEnd=data["charEnd"],
-            charStart=data["charStart"],
+            errorData=data["errorData"],
+            provenance=data["provenance"],
             filepath=data["filepath"],
             match_strength=data["match_strength"],
-            filehash=data["filehash"],
-            provenance=data["provenance"],
-            spdxDisj=[[LicenseMatch.from_dict(match) for match in group] for group in data["spdxDisj"]],
+        )
 
@@ -723,7 +720,31 @@ def get(self, org_slug: str, params: dict, use_types: bool = False) -> Union[dic
         )
         return {}
 
-    def post(self, files: list, params: FullScanParams, use_types: bool = False) -> Union[dict, CreateFullScanResponse]:
+    def post(self, files: list, params: FullScanParams, use_types: bool = False, use_lazy_loading: bool = False, workspace: str = None, max_open_files: int = 100) -> Union[dict, CreateFullScanResponse]:
+        """
+        Create a new full scan by uploading manifest files.
+
+        Args:
+            files: List of file paths to upload for scanning
+            params: FullScanParams object containing scan configuration
+            use_types: Whether to return typed response objects (default: False)
+            use_lazy_loading: Whether to use lazy file loading to prevent "too many open files"
+                errors when uploading large numbers of files (default: False)
+                NOTE: In version 3.0, this will default to True for better performance
+            workspace: Base directory path to make file paths relative to
+            max_open_files: Maximum number of files to keep open simultaneously when using
+                lazy loading. Useful for systems with low ulimit values (default: 100)
+
+        Returns:
+            dict or CreateFullScanResponse: API response containing scan results
+
+        Note:
+            When use_lazy_loading=True, files are opened only when needed during upload,
+            preventing file descriptor exhaustion. The max_open_files parameter controls how many
+            files can be open simultaneously - set this lower on systems with restrictive ulimits.
+
+            For large file uploads (>100 files), it's recommended to set use_lazy_loading=True.
+        """
         Utils.validate_integration_type(params.integration_type if params.integration_type else "api")
         org_slug = str(params.org_slug)
         params_dict = params.to_dict()
@@ -731,7 +752,13 @@ def post(self, files: list, params: FullScanParams, use_types: bool = False) ->
         params_arg = urllib.parse.urlencode(params_dict)
         path = "orgs/" + org_slug + "/full-scans?" + str(params_arg)
 
-        response = self.api.do_request(path=path, method="POST", files=files)
+        # Use lazy loading if requested
+        if use_lazy_loading:
+            prepared_files = Utils.load_files_for_sending_lazy(files, workspace, max_open_files)
+        else:
+            prepared_files = files
+
+        response = self.api.do_request(path=path, method="POST", files=prepared_files)
 
         if response.status_code == 201:
             result = response.json()
@@ -766,10 +793,10 @@ def stream_diff(
         before: str,
         after: str,
         use_types: bool = True,
-        include_license_details: bool = False,
+        include_license_details: str = "true",
         **kwargs,
     ) -> Union[dict, StreamDiffResponse]:
-        path = f"orgs/{org_slug}/full-scans/diff?before={before}&after={after}&{include_license_details}"
+        path = f"orgs/{org_slug}/full-scans/diff?before={before}&after={after}&include_license_details={include_license_details}"
         if kwargs:
             for key, value in kwargs.items():
                 path += f"&{key}={value}"
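
A matching sketch for full scans, under the same assumptions as the earlier example; the FullScanParams fields shown here are illustrative, not the full set:

    from socketdev.fullscans import FullScanParams

    params = FullScanParams(org_slug="my-org", repo="my-repo")  # illustrative fields
    res = client.fullscans.post(
        files=["package.json", "requirements.txt"],
        params=params,
        use_lazy_loading=True,  # recommended once uploads exceed ~100 files
    )

Note also the stream_diff fix above: include_license_details is now a string ("true"/"false") and is emitted as a proper include_license_details=<value> query parameter, instead of the bare, key-less value the old f-string produced.
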
diff --git a/socketdev/utils/__init__.py b/socketdev/utils/__init__.py
index dd90b16..bae4e42 100644
--- a/socketdev/utils/__init__.py
+++ b/socketdev/utils/__init__.py
@@ -1,12 +1,284 @@
-from typing import Literal
+from typing import Literal, List, Tuple
+import logging
+import os
+import weakref
+from threading import Lock
+
+log = logging.getLogger("socketdev")
 
 IntegrationType = Literal["api", "github", "gitlab", "bitbucket", "azure"]
 INTEGRATION_TYPES = ("api", "github", "gitlab", "bitbucket", "azure")
 
+
+class FileDescriptorManager:
+    """
+    Global manager to track and limit the number of open file descriptors.
+    Automatically closes the least recently used files when the limit is reached.
+    """
+    _instance = None
+    _lock = Lock()
+
+    def __new__(cls):
+        if cls._instance is None:
+            with cls._lock:
+                if cls._instance is None:
+                    cls._instance = super().__new__(cls)
+                    cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self):
+        if not self._initialized:
+            self.max_open_files = 100  # Default limit, can be overridden
+            self.open_files = []  # List of weakrefs to LazyFileLoader instances
+            self._initialized = True
+            log.debug(f"FileDescriptorManager initialized with default max_open_files={self.max_open_files}")
+
+    def set_max_open_files(self, max_files: int):
+        """Set the maximum number of open files."""
+        with self._lock:
+            self.max_open_files = max_files
+            log.debug(f"FileDescriptorManager max_open_files set to {self.max_open_files}")
+
+            # If we're now over the limit, close some files
+            while len(self.open_files) >= self.max_open_files:
+                self.open_files = [ref for ref in self.open_files if ref() is not None]
+                if len(self.open_files) >= self.max_open_files and self.open_files:
+                    oldest_ref = self.open_files.pop(0)
+                    oldest_file = oldest_ref()
+                    if oldest_file is not None and oldest_file._file is not None:
+                        oldest_file.close()
+                        log.debug(f"Auto-closed file due to new descriptor limit: {oldest_file.file_path}")
+                else:
+                    break
+
+    def register_file_open(self, lazy_file_loader):
+        """Register a file as opened and manage the descriptor limit."""
+        with self._lock:
+            # Remove any dead weak references
+            self.open_files = [ref for ref in self.open_files if ref() is not None]
+
+            # If we're at the limit, close the oldest file
+            if len(self.open_files) >= self.max_open_files:
+                oldest_ref = self.open_files.pop(0)
+                oldest_file = oldest_ref()
+                if oldest_file is not None and oldest_file._file is not None:
+                    oldest_file.close()
+                    log.debug(f"Auto-closed file due to descriptor limit: {oldest_file.file_path}")
+
+            # Add the new file to the end of the list
+            self.open_files.append(weakref.ref(lazy_file_loader))
+
+    def unregister_file(self, lazy_file_loader):
+        """Remove a file from the tracking list when it's closed."""
+        with self._lock:
+            self.open_files = [ref for ref in self.open_files
+                               if ref() is not None and ref() is not lazy_file_loader]
+
+
+# Global instance
+_fd_manager = FileDescriptorManager()
+
+
+class LazyFileLoader:
+    """
+    A file-like object that only opens the actual file when needed for reading.
+    This prevents keeping too many file descriptors open simultaneously.
+
+    This class implements the standard file-like interface that the requests
+    library expects for multipart uploads, making it a drop-in replacement for
+    regular file objects.
+    """
+
+    def __init__(self, file_path: str, name: str):
+        self.file_path = file_path
+        self.name = name
+        self._file = None
+        self._closed = False
+        self._position = 0
+        self._size = None
+
+    def _ensure_open(self):
+        """Ensure the file is open and seek to the correct position."""
+        if self._closed:
+            raise ValueError("I/O operation on closed file.")
+
+        if self._file is None:
+            try:
+                self._file = open(self.file_path, 'rb')
+                _fd_manager.register_file_open(self)
+                log.debug(f"Opened file for reading: {self.file_path}")
+                # Seek to the current position if we've been reading before
+                if self._position > 0:
+                    self._file.seek(self._position)
+            except OSError as e:
+                if e.errno == 24:  # Too many open files (EMFILE)
+                    # Try to force garbage collection to close unused files
+                    import gc
+                    gc.collect()
+                    # Retry once
+                    self._file = open(self.file_path, 'rb')
+                    _fd_manager.register_file_open(self)
+                    log.debug(f"Opened file for reading (after gc): {self.file_path}")
+                    if self._position > 0:
+                        self._file.seek(self._position)
+                else:
+                    raise
+
+    def _get_size(self):
+        """Get the file size without keeping the file open."""
+        if self._size is None:
+            self._size = os.path.getsize(self.file_path)
+        return self._size
+
+    def read(self, size: int = -1):
+        """Read from the file, opening it if needed."""
+        self._ensure_open()
+        data = self._file.read(size)
+        self._position = self._file.tell()
+
+        # If we've read the entire file, close it to free the file descriptor
+        if size == -1 or len(data) < size:
+            self.close()
+
+        return data
+
+    def readline(self, size: int = -1):
+        """Read a line from the file."""
+        self._ensure_open()
+        data = self._file.readline(size)
+        self._position = self._file.tell()
+        return data
+
+    def seek(self, offset: int, whence: int = 0):
+        """Seek to a position in the file."""
+        if self._closed:
+            raise ValueError("I/O operation on closed file.")
+
+        # Calculate the new position for tracking
+        if whence == 0:  # SEEK_SET
+            self._position = offset
+        elif whence == 1:  # SEEK_CUR
+            self._position += offset
+        elif whence == 2:  # SEEK_END
+            # We need to open the file to get its size
+            self._ensure_open()
+            result = self._file.seek(offset, whence)
+            self._position = self._file.tell()
+            return result
+
+        # If the file is already open, seek it too
+        if self._file is not None:
+            result = self._file.seek(self._position)
+            return result
+
+        return self._position
+
+    def tell(self):
+        """Return the current file position."""
+        if self._closed:
+            raise ValueError("I/O operation on closed file.")
+
+        if self._file is not None:
+            self._position = self._file.tell()
+
+        return self._position
+
+    def close(self):
+        """Close the file if it was opened."""
+        if self._file is not None:
+            self._file.close()
+            log.debug(f"Closed file: {self.file_path}")
+            self._file = None
+            _fd_manager.unregister_file(self)
+        self._closed = True
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+    def __len__(self):
+        """Return the file size. The requests library uses this for Content-Length."""
+        return self._get_size()
+
+    @property
+    def closed(self):
+        """Check if the file is closed."""
+        return self._closed
+
+    @property
+    def mode(self):
+        """Return the file mode."""
+        return 'rb'
+
+    def readable(self):
+        """Return whether the file is readable."""
+        return not self._closed
+
+    def writable(self):
+        """Return whether the file is writable."""
+        return False
+
+    def seekable(self):
+        """Return whether the file supports seeking."""
+        return True
+
+
 class Utils:
     @staticmethod
     def validate_integration_type(integration_type: str) -> IntegrationType:
         if integration_type not in INTEGRATION_TYPES:
             raise ValueError(f"Invalid integration type: {integration_type}")
         return integration_type  # type: ignore
+
+    @staticmethod
+    def load_files_for_sending_lazy(files: List[str], workspace: str = None, max_open_files: int = 100) -> List[Tuple[str, Tuple[str, LazyFileLoader]]]:
+        """
+        Prepares files for sending to the Socket API using lazy loading.
+
+        This version doesn't open all files immediately; instead it creates
+        LazyFileLoader objects that only open files when they're actually read.
+        This prevents "Too many open files" errors when dealing with large numbers
+        of manifest files.
+
+        Args:
+            files: List of file paths from find_files()
+            workspace: Base directory path to make paths relative to
+            max_open_files: Maximum number of files to keep open simultaneously (default: 100)
+
+        Returns:
+            List of tuples formatted for requests multipart upload:
+            [(field_name, (filename, lazy_file_object)), ...]
+        """
+        # Configure the file descriptor manager with the specified limit
+        _fd_manager.set_max_open_files(max_open_files)
+
+        send_files = []
+        if workspace and "\\" in workspace:
+            workspace = workspace.replace("\\", "/")
+
+        for file_path in files:
+            # Normalize the file path
+            if "\\" in file_path:
+                file_path = file_path.replace("\\", "/")
+
+            name = file_path.rsplit("/", 1)[-1]  # basename; safe even when the path has no slash
+
+            # Calculate the key (relative path from workspace)
+            if workspace and file_path.startswith(workspace):
+                key = file_path[len(workspace):]
+            else:
+                key = file_path
+
+            key = key.lstrip("/")
+            if key.startswith("./"):
+                key = key[2:]  # str.lstrip("./") would also eat leading dots from hidden filenames
+
+            # Create a lazy file loader instead of opening the file immediately
+            # Use the relative path (key) as the filename instead of the truncated basename
+            lazy_file = LazyFileLoader(file_path, key)
+            payload = (key, (key, lazy_file))
+            send_files.append(payload)
+
+        log.debug(f"Prepared {len(send_files)} files for lazy loading")
+        return send_files
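
A quick sketch of what load_files_for_sending_lazy returns and how the loaders behave; the paths here are hypothetical:

    from socketdev.utils import Utils

    send_files = Utils.load_files_for_sending_lazy(
        ["/repo/package.json", "/repo/pkg/requirements.txt"],
        workspace="/repo",
        max_open_files=100,
    )
    # [("package.json", ("package.json", <LazyFileLoader>)),
    #  ("pkg/requirements.txt", ("pkg/requirements.txt", <LazyFileLoader>))]

    key, (filename, loader) = send_files[0]
    print(len(loader))    # size via os.path.getsize(); no descriptor opened yet
    data = loader.read()  # opens on first read, auto-closes at EOF
    print(loader.closed)  # True: the descriptor was released as soon as reading finished
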
self.assertIn("results", result) - self.api.do_request.assert_called_with(path=f"orgs/{self.org_slug}/diff-scans", method="GET") - - def test_get(self): - self.api.do_request.return_value.status_code = 200 - # Simulate new API response structure - self.api.do_request.return_value.json.return_value = {"diff_scan": {"id": self.diff_scan_id}} - result = self.diffscans.get(self.org_slug, self.diff_scan_id) - self.assertIn("diff_scan", result) - self.assertEqual(result["diff_scan"]["id"], self.diff_scan_id) - self.api.do_request.assert_called_with(path=f"orgs/{self.org_slug}/diff-scans/{self.diff_scan_id}", method="GET") - - def test_create_from_repo(self): - self.api.do_request.return_value.status_code = 201 - self.api.do_request.return_value.json.return_value = {"created": True} - body = {"foo": "bar"} - result = self.diffscans.create_from_repo(self.org_slug, self.repo_slug, body) - self.assertTrue(result["created"]) - self.api.do_request.assert_called_with(path=f"orgs/{self.org_slug}/diff-scans/from-repo/{self.repo_slug}", method="POST", json=body) - - def test_create_from_ids(self): - self.api.do_request.return_value.status_code = 201 - self.api.do_request.return_value.json.return_value = {"created": True} - body = {"before": "id1", "after": "id2"} - result = self.diffscans.create_from_ids(self.org_slug, body) - self.assertTrue(result["created"]) - self.api.do_request.assert_called_with(path=f"orgs/{self.org_slug}/diff-scans/from-ids", method="POST", json=body) - - def test_gfm(self): - self.api.do_request.return_value.status_code = 200 - self.api.do_request.return_value.json.return_value = {"gfm": "markdown"} - result = self.diffscans.gfm(self.org_slug, self.diff_scan_id) - self.assertIn("gfm", result) - self.api.do_request.assert_called_with(path=f"orgs/{self.org_slug}/diff-scans/{self.diff_scan_id}/gfm", method="GET") - - def test_delete(self): - self.api.do_request.return_value.status_code = 200 - self.api.do_request.return_value.json.return_value = {"deleted": True} - result = self.diffscans.delete(self.org_slug, self.diff_scan_id) - self.assertTrue(result["deleted"]) - self.api.do_request.assert_called_with(path=f"orgs/{self.org_slug}/diff-scans/{self.diff_scan_id}", method="DELETE") - -if __name__ == "__main__": - unittest.main()