diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a02fbf0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# local helpers and temp files +.local +.workspace +.aider* +.env +.evidence +__pycache__ +*.jsonl +node_modules +venv +output +.token diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1d7625b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +FROM python:3.13-slim + +WORKDIR /app + +COPY requirements.txt . + +# Run as non-root user +RUN useradd --create-home --shell /bin/bash appuser \ + && mkdir -p output \ + && chown -R appuser:appuser output + +USER appuser + +RUN pip install --no-cache-dir -r requirements.txt + +COPY src/ ./src/ +COPY config/ ./config/ + +ENTRYPOINT ["python", "./src/metrics_check.py"] diff --git a/README.md b/README.md index 7bf9b2f..02e56b0 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,221 @@ +# GitHub Repository Metrics Collector + +This project collects various metrics from GitHub repositories using the GitHub GraphQL API and outputs them as JSONL files. It's designed to help analyze repository activity, contributors, issues, and other key metrics. + +## Features + +- Collects information on markdown files at repository root (e.g. README.md, LICENSE.md) +- Retrieves repository license information +- Gathers list of releases, with timestamps +- Tracks contributors and their most recent contribution dates +- Collects commit history +- Records issue information including creators and status + +## Prerequisites + +- Python 3 +- A GitHub personal access token with appropriate repository read permissions + +## Installation + +1. Clone the repository: + ```bash + git clone + cd + ``` + +2. Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Configuration + +The program can be configured using both a YAML configuration file and environment variables. Environment variables take precedence over the configuration file. 
+ +### Configuration File (config/config.yaml) + +The default configuration file is located at `config/config.yaml`: + +```yaml +repo: + owner: "duckdb" # Repository owner/organization + name: "duckdb-wasm" # Repository name + +output: + directory: "output" # Output directory for JSONL files + root_md_files: "root_md_files.jsonl" + license: "license.jsonl" + releases: "releases.jsonl" + contributors: "contributors.jsonl" + commits: "commits.jsonl" + issues: "issues.jsonl" + +github_api_url: "https://api.github.com/graphql" +pagination_limit: 100 # Number of items per API request +date_range_days: 365 # How far back to collect data (in days) +request_timeout: 30 # API request timeout in seconds +``` + +### Environment Variables + +You can override configuration values using environment variables: + +- `GITHUB_TOKEN`: (Required) Your GitHub personal access token +- `GITHUB_TOKEN_FILE`: (Alternative to `GITHUB_TOKEN`) Path to a file containing your GitHub personal access token +- `CONFIG_FILE`: Path to custom configuration file (optional) +- `REPO_OWNER`: Override repository owner from config +- `REPO_NAME`: Override repository name from config + +Note: You must provide either `GITHUB_TOKEN` or `GITHUB_TOKEN_FILE`, but not both. + +To create a GitHub personal access token: + +1. Go to GitHub Settings > Developer settings > Personal access tokens +2. Generate a new token with repo scope +3. Copy the token for use with this application + +To use a token file: +1. Create a file containing only your GitHub token (no extra characters or newlines) +2. Set the `GITHUB_TOKEN_FILE` environment variable to the path of this file + +## Running the Program + +### Running with Python + +1. Set your GitHub token as an environment variable: + ```bash + export GITHUB_TOKEN=your_github_token_here + ``` + + OR set the path to a token file: + ```bash + export GITHUB_TOKEN_FILE=/path/to/your/token/file + ``` + +2. 
Run the metrics collection script: + ```bash + python src/metrics_check.py + ``` + +3. To use a custom configuration file: + ```bash + export CONFIG_FILE=path/to/your/config.yaml + python src/metrics_check.py + ``` + +4. To override repository owner/name: + ```bash + export REPO_OWNER=your_owner + export REPO_NAME=your_repo + python src/metrics_check.py + ``` + +### Running with Docker + +You can run the application using the pre-built Docker image: + +1. Pull the image: + ```bash + docker pull codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +2. Run the container with your GitHub token: + ```bash + docker run --rm \ + -e GITHUB_TOKEN=your_github_token_here \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +3. To use a token file: + ```bash + docker run --rm \ + -e GITHUB_TOKEN_FILE=/app/token.txt \ + -v /path/to/your/token/file:/app/token.txt \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +4. To use a custom configuration file: + ```bash + docker run --rm \ + -e GITHUB_TOKEN=your_github_token_here \ + -v /path/to/your/config.yaml:/app/config/config.yaml \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +5. To override repository owner/name: + ```bash + docker run --rm \ + -e GITHUB_TOKEN=your_github_token_here \ + -e REPO_OWNER=your_owner \ + -e REPO_NAME=your_repo \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +## Output Files + +All output files are saved in JSONL format (JSON Lines), with one JSON object per line. By default, files are saved to the `output/` directory. 
+ +### root_md_files.jsonl + +Contains names of all markdown files in the repository root: + +```json +{"file": "README.md"} +{"file": "CONTRIBUTING.md"} +``` + +### license.jsonl + +Contains the repository license information: + +```json +{"license": "MIT License"} +``` + +### releases.jsonl + +Contains release information with timestamps: + +```json +{"name": "v1.0.0", "publishedAt": "2023-01-15T10:30:00Z"} +``` + +### contributors.jsonl + +Contains contributors with their most recent contribution date: + +```json +{"login": "username", "last_contribution": "2023-05-20T14:22:30Z"} +``` + +### commits.jsonl + +Contains commit information: + +```json +{"message": "Fix bug in parser", "date": "2023-05-19T09:15:00Z", "author": "Developer Name"} +``` + +### issues.jsonl + +Contains issue information: + +```json +{"title": "Bug in authentication", "state": "CLOSED", "author": "user123", "createdAt": "2023-04-10T16:45:00Z"} +``` + +## Customization + +You can modify the date range for data collection by changing the `date_range_days` value in the configuration. The default is 365 days (1 year). + +The pagination limit can also be adjusted with `pagination_limit` to control how many items are fetched per API request. 
class Configuration:
    """Runtime configuration for the metrics collector.

    Values are loaded from a YAML file (default: config/config.yaml) and can be
    overridden by environment variables (CONFIG_FILE, REPO_OWNER, REPO_NAME,
    GITHUB_TOKEN, GITHUB_TOKEN_FILE). Environment variables take precedence
    over the configuration file.
    """

    def __init__(self, config_file: Optional[str] = None):
        # Explicit argument > CONFIG_FILE env var > default path.
        self.config_file: str = config_file or os.environ.get('CONFIG_FILE', 'config/config.yaml')

        # Repository identity; validated by the caller before use.
        self.owner: Optional[str] = None
        self.repo_name: Optional[str] = None

        # Output directory and per-metric JSONL file names.
        self.output_dir: str = '.'
        self.root_md_files_output: str = 'root_md_files.jsonl'
        self.license_output: str = 'license.jsonl'
        self.releases_output: str = 'releases.jsonl'
        self.contributors_output: str = 'contributors.jsonl'
        self.commits_output: str = 'commits.jsonl'
        self.issues_output: str = 'issues.jsonl'

        # GitHub API defaults; overridable via the YAML file.
        self.github_api_url: str = 'https://api.github.com/graphql'
        self.github_token: Optional[str] = None
        self.pagination_limit: int = 100
        self.date_range_days: int = 365
        self.request_timeout: int = 30

        self._load_config(self.config_file)

        # Environment variables take precedence over file values.
        self.owner = os.environ.get('REPO_OWNER') or self.owner
        self.repo_name = os.environ.get('REPO_NAME') or self.repo_name
        self.github_token = self._get_github_token()

    def _get_github_token(self) -> Optional[str]:
        """Resolve the GitHub token.

        Priority: GITHUB_TOKEN env var > GITHUB_TOKEN_FILE env var > config
        file. Exits with status 1 when both env vars are set, or when the
        token file is missing, empty, or unreadable.
        """
        github_token = self.github_token  # From config file (lowest priority)

        # Check for GITHUB_TOKEN environment variable
        env_token = os.environ.get('GITHUB_TOKEN')

        # Check for GITHUB_TOKEN_FILE environment variable
        token_file_path = os.environ.get('GITHUB_TOKEN_FILE')

        # Refuse ambiguous configuration: only one env token source is allowed.
        if env_token and token_file_path:
            logger.error("Both GITHUB_TOKEN and GITHUB_TOKEN_FILE environment variables are set. Please use only one.")
            sys.exit(1)

        if env_token:
            return env_token
        elif token_file_path:
            try:
                with open(token_file_path, 'r') as f:
                    token = f.read().strip()
                if not token:
                    logger.error(f"Token file {token_file_path} is empty.")
                    sys.exit(1)
                return token
            except FileNotFoundError:
                logger.error(f"Token file {token_file_path} not found.")
                sys.exit(1)
            except Exception as e:
                logger.error(f"Error reading token file {token_file_path}: {e}")
                sys.exit(1)

        return github_token

    def _load_config(self, config_file: str) -> None:
        """Load configuration from a YAML file; missing keys keep their defaults.

        Exits with status 1 when the file is absent or cannot be parsed.
        """
        try:
            with open(config_file, 'r') as f:
                # safe_load returns None for an empty file; substitute an empty
                # mapping so the .get() defaults below still apply instead of
                # crashing with AttributeError (bug fix).
                config = yaml.safe_load(f) or {}

            # Repository identity.
            repo_config = config.get('repo', {})
            self.owner = repo_config.get('owner')
            self.repo_name = repo_config.get('name')

            # Output locations.
            output_config = config.get('output', {})
            self.output_dir = output_config.get('directory', '.')
            self.root_md_files_output = output_config.get('root_md_files', 'root_md_files.jsonl')
            self.license_output = output_config.get('license', 'license.jsonl')
            self.releases_output = output_config.get('releases', 'releases.jsonl')
            self.contributors_output = output_config.get('contributors', 'contributors.jsonl')
            self.commits_output = output_config.get('commits', 'commits.jsonl')
            self.issues_output = output_config.get('issues', 'issues.jsonl')

            # GitHub API settings.
            self.github_api_url = config.get('github_api_url', 'https://api.github.com/graphql')
            self.github_token = config.get('github_token')  # Allow token in config file
            self.pagination_limit = config.get('pagination_limit', 100)
            self.date_range_days = config.get('date_range_days', 365)
            self.request_timeout = config.get('request_timeout', 30)

        except FileNotFoundError:
            logger.error(f"{config_file} file not found.")
            sys.exit(1)
        except Exception as e:
            logger.error(f"Error parsing {config_file}: {e}")
            sys.exit(1)
class FileWriter:
    """Serializes collected repository metrics to JSONL files.

    Each write_* method targets one file inside the configured output
    directory and emits one JSON object per line.
    """

    def __init__(self, config: Configuration):
        self.config = config

    def _get_output_path(self, filename: str) -> str:
        """Return the full path of *filename* inside the output directory."""
        return os.path.join(self.config.output_dir, filename)

    def write_root_md_files(self, md_files: List[str]) -> None:
        """Write the .md files in the root folder as JSONL"""
        target = self._get_output_path(self.config.root_md_files_output)
        logger.info(f"Writing {len(md_files)} .md files to {target}")
        with open(target, 'w') as fh:
            fh.writelines(json.dumps({"file": name}) + '\n' for name in md_files)

    def write_license(self, license_name: str) -> None:
        """Write the repository license name as JSONL"""
        target = self._get_output_path(self.config.license_output)
        logger.info(f"Writing license to {target}: {license_name}")
        with open(target, 'w') as fh:
            fh.write(json.dumps({"license": license_name}) + '\n')

    def write_releases(self, releases: List[Dict[str, str]]) -> None:
        """Write all releases with timestamps as JSONL"""
        target = self._get_output_path(self.config.releases_output)
        logger.info(f"Writing {len(releases)} releases to {target}")
        with open(target, 'w') as fh:
            fh.writelines(json.dumps(entry) + '\n' for entry in releases)

    def write_contributors(self, contributors: Dict[str, str]) -> None:
        """Write all contributors with their most recent contribution date as JSONL"""
        target = self._get_output_path(self.config.contributors_output)
        logger.info(f"Writing {len(contributors)} contributors to {target}")
        with open(target, 'w') as fh:
            fh.writelines(
                json.dumps({"login": login, "last_contribution": stamp}) + '\n'
                for login, stamp in contributors.items()
            )

    def write_commits(self, commits: List[Dict[str, str]]) -> None:
        """Write all commits as JSONL"""
        target = self._get_output_path(self.config.commits_output)
        logger.info(f"Writing {len(commits)} commits to {target}")
        with open(target, 'w') as fh:
            fh.writelines(json.dumps(entry) + '\n' for entry in commits)

    def write_issues(self, issues: List[Dict[str, str]]) -> None:
        """Write all issues with creator and status as JSONL"""
        target = self._get_output_path(self.config.issues_output)
        logger.info(f"Writing {len(issues)} issues to {target}")
        with open(target, 'w') as fh:
            fh.writelines(json.dumps(entry) + '\n' for entry in issues)
class GitHubClient:
    """Thin client for the GitHub GraphQL API.

    Holds the auth headers, the API endpoint, the date-range cutoff, and the
    GraphQL query templates (owner/name are interpolated with ``%`` by the
    callers in MetricsProcessor). Exits with status 1 when no token is set.
    """

    def __init__(self, config: Configuration):
        self.config = config

        # Check if GitHub token is set in config (from env var or config file)
        self.github_token = self.config.github_token

        if not self.github_token:
            logger.error("Please set GITHUB_TOKEN environment variable.")
            sys.exit(1)

        # Headers for GraphQL API
        self.headers = {
            'Authorization': f'bearer {self.github_token}',
            'Content-Type': 'application/json'
        }

        # GraphQL endpoint
        self.url = self.config.github_api_url

        # Calculate the date for specified days ago.
        # NOTE(review): datetime.now() is naive local time but is formatted
        # with a 'Z' (UTC) suffix; consider datetime.now(timezone.utc) — confirm.
        self.date_range_ago = (datetime.now() - timedelta(days=self.config.date_range_days)).strftime('%Y-%m-%dT%H:%M:%SZ')

        # GraphQL Queries with configurable pagination limit
        self.ROOT_FILES_QUERY = '''
        {
          repository(owner: "%s", name: "%s") {
            object(expression: "HEAD:") {
              ... on Tree {
                entries {
                  name
                }
              }
            }
          }
        }
        '''

        self.LICENSE_QUERY = '''
        {
          repository(owner: "%s", name: "%s") {
            licenseInfo {
              name
            }
          }
        }
        '''

        self.RELEASES_QUERY = f'''
        query($cursor: String) {{
          repository(owner: "%s", name: "%s") {{
            releases(first: {self.config.pagination_limit}, orderBy: {{field: CREATED_AT, direction: DESC}}, after: $cursor) {{
              edges {{
                node {{
                  name
                  publishedAt
                }}
                cursor
              }}
              pageInfo {{
                hasNextPage
                endCursor
              }}
            }}
          }}
        }}
        '''

        self.CONTRIBUTORS_QUERY = f'''
        query($cursor: String, $since: GitTimestamp!) {{
          repository(owner: "%s", name: "%s") {{
            defaultBranchRef {{
              target {{
                ... on Commit {{
                  history(first: {self.config.pagination_limit}, since: $since, after: $cursor) {{
                    nodes {{
                      author {{
                        user {{
                          login
                        }}
                      }}
                      committedDate
                    }}
                    pageInfo {{
                      hasNextPage
                      endCursor
                    }}
                  }}
                }}
              }}
            }}
          }}
        }}
        '''

        self.COMMITS_QUERY = f'''
        query($cursor: String, $since: GitTimestamp!) {{
          repository(owner: "%s", name: "%s") {{
            defaultBranchRef {{
              target {{
                ... on Commit {{
                  history(first: {self.config.pagination_limit}, since: $since, after: $cursor) {{
                    nodes {{
                      messageHeadline
                      committedDate
                      author {{
                        name
                      }}
                    }}
                    pageInfo {{
                      hasNextPage
                      endCursor
                    }}
                  }}
                }}
              }}
            }}
          }}
        }}
        '''

        self.ISSUES_QUERY = f'''
        query($cursor: String) {{
          repository(owner: "%s", name: "%s") {{
            issues(first: {self.config.pagination_limit}, states: [OPEN, CLOSED], after: $cursor, orderBy: {{field: CREATED_AT, direction: DESC}}) {{
              nodes {{
                title
                state
                author {{
                  login
                }}
                createdAt
              }}
              pageInfo {{
                hasNextPage
                endCursor
              }}
            }}
          }}
        }}
        '''

    def run_query(self, query: str, variables: Optional[Dict] = None) -> Optional[Dict[Any, Any]]:
        """Run a GraphQL query with variables and return the parsed JSON response.

        Returns None on a non-200 status or any request exception.
        """
        # Bug fix: the previous `variables: Dict = {}` default was a shared
        # mutable object across calls; use a None sentinel instead.
        if variables is None:
            variables = {}
        try:
            logger.debug(f"Running query with variables: {variables}")
            response = requests.post(
                self.url,
                headers=self.headers,
                json={'query': query, 'variables': variables},
                timeout=self.config.request_timeout
            )
            if response.status_code == 200:
                logger.debug("Query successful")
                return response.json()
            else:
                logger.error(f"Query failed with status code {response.status_code}")
                return None
        except Exception as e:
            logger.error(f"Query failed with exception: {e}")
            return None
def main() -> None:
    """Entry point: load configuration, collect each metric, write JSONL files.

    Exits with status 1 when the repository identity is missing or any step
    of the collection pipeline raises.
    """
    logger.info("Starting metrics check")
    start_time = datetime.now()

    try:
        # CONFIG_FILE env var may point at an alternate YAML; Configuration
        # falls back to its default path when it is unset.
        config = Configuration(os.environ.get('CONFIG_FILE'))

        # Repository owner/name are mandatory, whatever their source.
        if not (config.owner and config.repo_name):
            logger.error("Please ensure repo.owner and repo.name are set either in config file or via environment variables (REPO_OWNER, REPO_NAME).")
            sys.exit(1)

        github_client = GitHubClient(config)
        processor = MetricsProcessor(github_client, config)
        writer = FileWriter(config)

        owner, repo_name = config.owner, config.repo_name
        logger.info(f"Processing repository: {owner}/{repo_name}")

        # Fetch/write pairs, executed strictly in this order.
        pipeline = [
            (processor.get_root_md_files, writer.write_root_md_files),
            (processor.get_license, writer.write_license),
            (processor.get_releases, writer.write_releases),
            (processor.get_contributors, writer.write_contributors),
            (processor.get_commits, writer.write_commits),
            (processor.get_issues, writer.write_issues),
        ]
        for fetch, write in pipeline:
            write(fetch(owner, repo_name))

        elapsed = datetime.now() - start_time
        logger.info(f"Metrics check completed in {elapsed.total_seconds():.2f} seconds")

    except Exception as e:
        logger.error(f"Application failed: {e}")
        sys.exit(1)

if __name__ == '__main__':
    main()
class MetricsProcessor:
    """Runs GraphQL queries via GitHubClient and reshapes the responses.

    The _extract_* helpers each take one raw GraphQL response and return a
    (batch, page_info) pair for _paginate_github_query; returning a None
    page_info stops pagination early.
    """

    def __init__(self, github_client: GitHubClient, config: Configuration):
        self.github_client = github_client
        self.config = config

    def get_root_md_files(self, owner: str, repo_name: str) -> List[str]:
        """Get all .md files in the root folder (single query, no pagination)."""
        logger.info("Checking root .md files...")
        query = self.github_client.ROOT_FILES_QUERY % (owner, repo_name)
        result = self.github_client.run_query(query)

        md_files = []
        if result and 'data' in result and result['data']['repository']['object']:
            entries = result['data']['repository']['object']['entries']
            md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')]
            logger.info(f"Found {len(md_files)} .md files in root")
        else:
            logger.warning("No .md files found or error occurred")
        return md_files

    def get_license(self, owner: str, repo_name: str) -> str:
        """Get the repository license name; returns the string 'None' when absent."""
        logger.info("Checking license...")
        query = self.github_client.LICENSE_QUERY % (owner, repo_name)
        result = self.github_client.run_query(query)

        license_name = 'None'
        if result and 'data' in result:
            license_info = result['data']['repository']['licenseInfo']
            license_name = license_info['name'] if license_info else 'None'
            logger.info(f"License found: {license_name}")
        else:
            logger.warning("Error retrieving license information")
        return license_name

    def _extract_releases(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]:
        """Extract releases from one GraphQL response page.

        Returns (releases, page_info); page_info is None when an out-of-range
        release is seen, which tells the paginator to stop.
        """
        releases = []
        page_info = None

        if 'data' in result and result['data']['repository']:
            repo_data = result['data']['repository']
            if repo_data.get('releases'):
                release_edges = repo_data['releases']['edges']
                page_info = repo_data['releases']['pageInfo']

                # Filter releases from the past year
                for edge in release_edges:
                    published_at = edge['node']['publishedAt']
                    # ISO-8601 UTC timestamps compare correctly as strings.
                    if published_at and published_at >= self.github_client.date_range_ago:
                        releases.append({
                            'name': edge['node']['name'] or 'Unnamed release',
                            'publishedAt': published_at
                        })
                    elif published_at and published_at < self.github_client.date_range_ago:
                        # Stop pagination if we've gone past the date range boundary
                        # (results are ordered newest-first, so nothing newer follows).
                        logger.debug(f"Reached releases older than {self.config.date_range_days} days, stopping pagination")
                        return releases, None

        return releases, page_info

    def get_releases(self, owner: str, repo_name: str) -> List[Dict[str, str]]:
        """Get all releases with timestamps from the past date range"""
        logger.info("Checking releases...")
        query = self.github_client.RELEASES_QUERY % (owner, repo_name)
        releases = self._paginate_github_query(query, self._extract_releases)
        logger.info(f"Found {len(releases)} releases in the past {self.config.date_range_days} days")
        return releases

    def _extract_contributors(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]:
        """Extract contributors from one GraphQL response page.

        Returns ([contributors_dict], page_info): the dict is wrapped in a
        one-element list so the generic paginator can extend() batches;
        get_contributors merges the per-page dicts afterwards.
        """
        contributors: Dict[str, str] = {}
        page_info = None

        if 'data' in result and result['data']['repository']:
            repo_data = result['data']['repository']
            if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'):
                target = repo_data['defaultBranchRef']['target']
                if target.get('history'):
                    history = target['history']
                    commit_nodes = history['nodes']
                    page_info = history['pageInfo']

                    for commit in commit_nodes:
                        # Commits without a linked GitHub user account are skipped.
                        if commit.get('author') and commit['author'].get('user'):
                            login = commit['author']['user']['login']
                            date = commit['committedDate']
                            if login and date:
                                # Keep only the most recent date per login.
                                if login not in contributors or date > contributors[login]:
                                    contributors[login] = date

        return [contributors], page_info

    def get_contributors(self, owner: str, repo_name: str) -> Dict[str, str]:
        """Get all contributors with their most recent contribution date from the past date range"""
        logger.info("Checking contributors...")
        query = self.github_client.CONTRIBUTORS_QUERY % (owner, repo_name)
        contributor_list = self._paginate_github_query(query, self._extract_contributors, {'since': self.github_client.date_range_ago})

        # Merge all contributor dictionaries (one per fetched page), keeping
        # the most recent contribution date per login.
        final_contributors: Dict[str, str] = {}
        for contributors in contributor_list:
            for login, date in contributors.items():
                if login not in final_contributors or date > final_contributors[login]:
                    final_contributors[login] = date

        logger.info(f"Found {len(final_contributors)} contributors in the past {self.config.date_range_days} days")
        return final_contributors

    def _extract_commits(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]:
        """Extract commits from one GraphQL response page.

        Date filtering is done server-side via the query's $since variable,
        so every node here is within range.
        """
        commits = []
        page_info = None

        if 'data' in result and result['data']['repository']:
            repo_data = result['data']['repository']
            if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'):
                target = repo_data['defaultBranchRef']['target']
                if target.get('history'):
                    history = target['history']
                    commit_nodes = history['nodes']
                    page_info = history['pageInfo']

                    for commit in commit_nodes:
                        commits.append({
                            'message': commit.get('messageHeadline', ''),
                            'date': commit.get('committedDate', ''),
                            'author': commit.get('author', {}).get('name', 'Unknown') if commit.get('author') else 'Unknown'
                        })

        return commits, page_info

    def get_commits(self, owner: str, repo_name: str) -> List[Dict[str, str]]:
        """Get all commits from the past date range"""
        logger.info("Checking commits...")
        query = self.github_client.COMMITS_QUERY % (owner, repo_name)
        commits = self._paginate_github_query(query, self._extract_commits, {'since': self.github_client.date_range_ago})
        logger.info(f"Found {len(commits)} commits in the past {self.config.date_range_days} days")
        return commits

    def _extract_issues(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]:
        """Extract issues from one GraphQL response page.

        Returns (issues, page_info); page_info is None when an out-of-range
        issue is seen, which tells the paginator to stop.
        """
        issues = []
        page_info = None

        if 'data' in result and result['data']['repository']:
            repo_data = result['data']['repository']
            if repo_data.get('issues'):
                issue_nodes = repo_data['issues']['nodes']
                page_info = repo_data['issues']['pageInfo']

                # Filter issues from the past date range
                for issue in issue_nodes:
                    created_at = issue.get('createdAt')
                    if created_at and created_at >= self.github_client.date_range_ago:
                        issues.append({
                            'title': issue.get('title', ''),
                            'state': issue.get('state', ''),
                            # author is None for deleted ("ghost") accounts.
                            'author': issue.get('author', {}).get('login', 'Unknown') if issue.get('author') else 'Unknown',
                            'createdAt': created_at
                        })
                    elif created_at and created_at < self.github_client.date_range_ago:
                        # Stop pagination if we've gone past the date range boundary
                        # (results are ordered newest-first, so nothing newer follows).
                        logger.debug(f"Reached issues older than {self.config.date_range_days} days, stopping pagination")
                        return issues, None

        return issues, page_info

    def get_issues(self, owner: str, repo_name: str) -> List[Dict[str, str]]:
        """Get all issues with creator and status from the past date range"""
        logger.info("Checking issues...")
        query = self.github_client.ISSUES_QUERY % (owner, repo_name)
        issues = self._paginate_github_query(query, self._extract_issues)
        logger.info(f"Found {len(issues)} issues in the past {self.config.date_range_days} days")
        return issues

    def _paginate_github_query(
        self,
        query: str,
        extract_function,
        initial_variables: Optional[Dict] = None
    ) -> List[Any]:
        """Generic cursor pagination over a GitHub GraphQL connection.

        extract_function maps a raw response to (batch, page_info); pagination
        continues while page_info reports hasNextPage, and stops early when
        the extractor returns page_info=None or a query fails.
        """
        if initial_variables is None:
            initial_variables = {}

        all_data = []
        has_next_page = True
        cursor = None
        variables = initial_variables.copy()

        page_count = 0
        while has_next_page:
            page_count += 1
            logger.info(f"Fetching page {page_count}...")

            if cursor:
                variables['cursor'] = cursor
            else:
                # Remove cursor from variables if it's None
                # (first page must omit it rather than pass null).
                variables.pop('cursor', None)

            result = self.github_client.run_query(query, variables)

            if not result:
                logger.warning("Query returned no result, stopping pagination")
                break

            data_batch, page_info = extract_function(result)
            all_data.extend(data_batch)

            logger.debug(f"Retrieved {len(data_batch)} items in this batch")

            if page_info and page_info.get('hasNextPage'):
                cursor = page_info.get('endCursor')
                logger.debug(f"Next cursor: {cursor}")
            else:
                has_next_page = False

        logger.info(f"Pagination complete. Total pages: {page_count}, Total items: {len(all_data)}")
        return all_data