diff --git a/tools/issue-analysis/.gitignore b/tools/issue-analysis/.gitignore new file mode 100644 index 0000000000..e00522887d --- /dev/null +++ b/tools/issue-analysis/.gitignore @@ -0,0 +1,6 @@ +# Data directory (large JSON files) +data/ + +# Python cache +__pycache__/ +*.pyc diff --git a/tools/issue-analysis/ANALYZE_RESULTS_PLAN.md b/tools/issue-analysis/ANALYZE_RESULTS_PLAN.md new file mode 100644 index 0000000000..d1d3ec6306 --- /dev/null +++ b/tools/issue-analysis/ANALYZE_RESULTS_PLAN.md @@ -0,0 +1,529 @@ +# How to Create the Issue Analysis Report + +**Purpose:** This is a process guide for creating comprehensive issue analysis reports from the generated data. + +**Actual Analysis Location:** The analysis report itself goes in **`results/ISSUE_ANALYSIS.md`** (not this file). + +This document describes: +- What data is available in `results/*.txt` +- How to structure the analysis report +- What to include in each section +- How to prioritize findings + +## Objective + +The `results/ISSUE_ANALYSIS.md` should provide: +1. Executive summary of the Slang codebase quality +2. Deep dive into 10 most critical areas requiring attention +3. Data-driven recommendations for improving quality and reducing bugs + +## Data Sources + +### Phase 1: Analysis Outputs (Identify Problem Areas) + +Three complementary reports in `results/`: + +1. **general-analysis.txt** - Overall PR and issue trends + - Total issues: 3,573 + - Total PRs: 5,425 + - Bug fix rate: 26.1% (1,417 bug-fix PRs) + - Provides: PR velocity, issue trends, test coverage, file-level bug frequencies + +2. **critical-analysis.txt** - Critical bugs (crashes, ICEs, etc.) + - Total critical issues: 1,066 + - Critical bug-fix PRs: 702 + - Provides: Root causes, critical components, severity breakdown + +3. **bugfix-files-analysis.txt** - File and component bug patterns + - Analyzes all 1,417 bug-fix PRs + - Provides: Component-level metrics (fixes, changes, LOC) + - File-level bug fix frequencies + +### Phase 2: Raw Data (Deep-Dive for Evidence) + +Raw issue and PR data in `data/`: + +1. **data/issues.json** - All 3,573 issues + - Fields: number, title, body, labels, state, created_at, closed_at, comments, user + - Use for: Finding specific examples, understanding issue descriptions, analyzing patterns + +2. **data/pull_requests.json** - All 5,425 PRs + - Fields: number, title, body, labels, state, created_at, merged_at, files_changed + - files_changed includes: filename, additions, deletions, changes + - Use for: Understanding fixes, finding related issues, analyzing change patterns + +3. **data/critical_issues.csv** - Export of critical issues + - Pre-filtered critical issues for easier analysis + - Use for: Quick access to critical issue details + +## Analysis Structure + +### 1. Executive Summary (1 page) + +**Key Metrics** (extract from `results/*.txt`): +- Total issues and PRs analyzed +- Bug fix rate (% of PRs that are bug fixes) +- Critical issue count and types +- Test coverage statistics +- Average time to close issues/merge PRs + +**Trends** (calculate from raw data): +- Bug fix trend over time (increasing/decreasing) +- Critical issue trend over time +- PR velocity trend +- Test coverage trend + +**Overall Assessment** (synthesize from data): +- Overall quality trend (improving/stable/declining) +- Top 5 risk areas (from priority score analysis) +- Top 5 areas in open issues and PRs (current status) +- Key strengths (what's working well) +- Immediate attention needed (critical findings) + +### 2. 
Top 10 Critical Areas for Improvement + +Each area should include: + +**Format:** +``` +## Area N: [Component/System Name] + +**Severity:** Critical/High/Medium +**Impact:** [Scope of impact - crashes, correctness, performance, etc.] + +### Current State +- Bug frequency: X fixes per 1000 LOC +- Total bug fixes: X PRs +- Critical issues: X crashes/ICEs +- Test coverage: X% + +### Root Causes +1. [Primary cause with evidence] +2. [Secondary cause with evidence] +3. [Contributing factors] + +### Evidence +- File: [filename] - X fixes, Y LOC, Z fix frequency +- Issues: #[number], #[number] (examples) +- Patterns: [observed patterns from analysis] + +### Recommendations + +Prioritize recommendations into short-term and long-term actions (no specific timelines). +Order by priority within each category. + +**Short-term priorities:** +1. Action 1 with expected impact (highest priority quick win) +2. Action 2 with expected impact +3. Action 3 with expected impact + +**Long-term priorities:** +1. Strategic improvement 1 (highest priority long-term) +2. Strategic improvement 2 +3. Architectural or process change + +### Success Metrics +- Reduce bug fix frequency to < X per 1000 LOC +- Reduce critical issues by X% +- Achieve X% test coverage +``` + +## Discovering Critical Areas (No Assumptions) + +The analysis must identify problem areas purely from data. Do not make assumptions about which components are problematic. + +### Selection Criteria + +Select the top 10 areas based on data from `results/*.txt` files: + +1. **High Bug Fix Frequency (normalized by LOC)** + - From: "TOP 40 FILES BY BUG FIX FREQUENCY" sections + - Look for: Files with >50 fixes per 1000 LOC + - Group related files into components + +2. **High Absolute Bug Count** + - From: "ALL COMPONENTS BY BUG-FIX FREQUENCY" section + - Look for: Components with >100 total bug fixes + - Consider: Both total fixes and LOC for context + +3. **Critical Issue Concentration** + - From: "ROOT CAUSE COMPONENTS" section in critical-analysis.txt + - Look for: Components with >5 critical issues + - Prioritize: Crashes and ICEs over other issue types + +4. **Cross-Category Appearance** + - Components that appear in: + - High bug frequency lists + - Critical issue lists + - High change volume lists + - These are strong candidates for deep-dive + +5. **Test Coverage Gaps** + - From: General analysis test coverage sections + - Components with low test coverage AND high bug counts + - Indicates systemic quality issues + +### Decision Matrix + +For each component found in the data, calculate a priority score: + +``` +Priority Score = (Bug Fix Frequency × 0.3) + + (Critical Issues × 0.4) + + (Total Fixes / 100 × 0.2) + + (Cross-Category Appearances × 0.1) +``` + +Select the top 10 by priority score for detailed analysis. 
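+
+As a concrete illustration, the sketch below applies the decision-matrix weights to a
+small metrics table and selects the highest-scoring components. The component names and
+numbers are placeholders rather than real measurements, and the raw weighted sum shown
+here is refined in Step 3B below, which normalizes each term to a 0-1 scale before
+applying the same weights.
+
+```python
+# Illustrative sketch only: component names and metric values are placeholders.
+components = {
+    # name: (bug_fix_freq, critical_issues, total_fixes, cross_category_appearances)
+    "component-a": (42.0, 12, 310, 3),
+    "component-b": (18.5, 30, 150, 2),
+    "component-c": (65.0, 4, 80, 1),
+}
+
+def priority_score(bug_fix_freq, critical_issues, total_fixes, cross_category):
+    # Weights match the decision matrix above.
+    return (bug_fix_freq * 0.3
+            + critical_issues * 0.4
+            + total_fixes / 100 * 0.2
+            + cross_category * 0.1)
+
+ranked = sorted(
+    ((name, priority_score(*metrics)) for name, metrics in components.items()),
+    key=lambda item: item[1],
+    reverse=True,
+)
+
+# Keep the top 10 (only three placeholders exist here) for deep-dive analysis.
+for name, score in ranked[:10]:
+    print(f"{name:15} priority={score:6.2f}")
+```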
+ +## Analysis Process + +### Phase 1: Identify Problem Areas (From Analysis Outputs) + +#### Step 1: Extract Key Metrics +```bash +# Get component-level statistics +grep -A 50 "ALL COMPONENTS BY BUG-FIX FREQUENCY" results/bugfix-files-analysis.txt + +# Get file-level bug frequencies +grep -A 40 "TOP 40 FILES BY BUG FIX FREQUENCY" results/bugfix-files-analysis.txt +grep -A 40 "TOP 40 FILES BY BUG FIX FREQUENCY" results/general-analysis.txt +grep -A 40 "TOP 40 FILES BY CRITICAL BUG FIX FREQUENCY" results/critical-analysis.txt + +# Get critical issue patterns +grep -A 20 "ROOT CAUSE COMPONENTS" results/critical-analysis.txt +grep -A 20 "CRITICAL ISSUES BY TYPE" results/critical-analysis.txt +``` + +#### Step 2: Cross-Reference Analysis +- Correlate high bug fix frequency with critical issues +- Identify components appearing in multiple problem categories +- Look for patterns in error types (crashes, ICEs, validation) +- Note files with disproportionately high bug fix frequency per LOC + +#### Step 3: Identify Top 10 Areas + +**A. Extract Component Metrics** + +Create a table with all components found in the analysis: + +``` +Component Name | Bug Fixes | Bug Fix Freq | LOC | Critical Issues | In Multiple Lists +---------------|-----------|--------------|-----|-----------------|------------------ +``` + +Sources: +- Bug Fixes: From "ALL COMPONENTS BY BUG-FIX FREQUENCY" +- Bug Fix Freq: From "TOP 40 FILES BY BUG FIX FREQUENCY" (average for component) +- LOC: From "ALL COMPONENTS BY BUG-FIX FREQUENCY" (LOC column) +- Critical Issues: From "ROOT CAUSE COMPONENTS" +- In Multiple Lists: Count how many analysis sections mention this component + +**B. Calculate Priority Scores** + +For each component: +```python +# Normalize values to 0-1 scale first +normalized_freq = bug_fix_freq / max_bug_fix_freq +normalized_critical = critical_issues / max_critical_issues +normalized_fixes = total_fixes / max_total_fixes +cross_category = appearances_count / 3 # Max 3 categories + +priority_score = (normalized_freq * 0.3) + \ + (normalized_critical * 0.4) + \ + (normalized_fixes * 0.2) + \ + (cross_category * 0.1) +``` + +**C. Select Top 10** + +- Sort components by priority score +- Take top 10 +- Exclude "test" and "docs" unless they show exceptional problems +- Include at least 2-3 high-frequency files even if they're in same component + (e.g., specific problematic files like slang.cpp, slang-compiler.cpp) + +**D. 
Validate Selection** + +Ensure selected areas: +- Represent actual code quality issues (not just high activity) +- Have actionable scope (not too broad like "all IR") +- Show clear patterns that suggest root causes +- Have sufficient data for deep-dive analysis + +### Phase 2: Deep-Dive Using Raw Data + +For each of the identified top 10 areas: + +#### Step 4: Find Specific Issues and PRs + +**Example: For "IR Optimization" area identified in Phase 1** + +```python +import json + +# Load raw data +with open('data/issues.json') as f: + issues = json.load(f) +with open('data/pull_requests.json') as f: + prs = json.load(f) + +# Find issues mentioning IR components +ir_issues = [ + issue for issue in issues + if 'slang-ir' in issue.get('title', '').lower() or + 'slang-ir' in (issue.get('body') or '').lower() +] + +# Find PRs that modified IR files +ir_prs = [ + pr for pr in prs + if any('slang-ir' in f['filename'] for f in pr.get('files_changed', [])) +] + +# Find critical IR issues +critical_ir = [ + issue for issue in ir_issues + if any(keyword in issue.get('title', '').lower() + for keyword in ['crash', 'ice', 'assert', 'segfault']) +] +``` + +#### Step 5: Extract Evidence + +For each critical area, gather: + +**Issue Examples:** +- Find 3-5 representative issues (issue number, title, key symptoms) +- Look for recurring patterns in issue descriptions +- Note user-reported impact + +**PR Analysis:** +- Examine files changed in bug-fix PRs +- Identify common fix patterns +- Calculate average time to fix +- Note if fixes clustered in specific time periods + +**Pattern Recognition:** +```python +# Example: Analyze issue titles for patterns +from collections import Counter + +keywords = [] +for issue in critical_ir: + title = issue['title'].lower() + # Extract meaningful keywords + for word in ['specialization', 'inlining', 'lowering', 'legalization', + 'optimization', 'transformation']: + if word in title: + keywords.append(word) + +pattern_frequency = Counter(keywords) +# Shows which IR operations are most problematic +``` + +#### Step 6: Root Cause Analysis + +For each area, synthesize evidence to identify: +- **Technical root causes**: Architecture issues, complexity, missing validation +- **Process root causes**: Insufficient testing, documentation gaps +- **Patterns**: Are bugs clustered in new features? Legacy code? Specific backends? + +Example questions to answer: +- What types of bugs are most common? (crashes vs. correctness vs. performance) +- Are bugs in new code or old code? +- Are certain code paths undertested? +- Is the component trying to do too much? +- Are there missing abstractions? + +#### Step 7: Prioritization + +Rank the 10 areas by: +1. **Impact**: Critical issues > Correctness > Performance > Usability +2. **Frequency**: Bug fix rate normalized by LOC +3. **Trend**: Use issue `created_at` dates to see if problems increasing/decreasing +4. **Blast radius**: How many users/features affected (check issue comment count, labels) +5. **Fix difficulty**: Average PR size, time to merge for fixes + +#### Step 8: Recommendation Development + +For each area, use evidence to develop prioritized recommendations. +Order by priority (most impactful first) within each category. 
+ +**Short-term priorities:** +- Based on quick wins from PR patterns +- Focus on high-frequency, similar bugs +- High impact/effort ratio +- Can be implemented with existing architecture +- Example: "Add validation for X based on 15 similar crashes" + +**Long-term priorities:** +- Based on architectural issues seen in multiple PRs +- Requires significant refactoring or redesign +- Addresses fundamental design issues +- Higher effort but prevents entire classes of bugs +- Examples: + - "Refactor Y to reduce complexity (seen in 45 bug fixes)" + - "Redesign Z architecture (root cause of 30% of crashes)" + +### Step 9: Validate with Data + +For each recommendation: +- Show specific issue/PR numbers as evidence +- Calculate expected impact (e.g., "Could prevent 20% of IR crashes") +- Reference specific patterns from the data + +## Output Format + +The final `results/ISSUE_ANALYSIS.md` should include: +- Clear structure with table of contents +- Executive summary (1 page) +- Top 10 critical areas (detailed analysis) +- Appendices with supporting data +- No emojis +- Professional, data-driven tone +- Actionable recommendations with expected impact + +## Success Criteria + +The final analysis should: +- Be immediately actionable +- Provide clear prioritization (by priority order, not timelines) +- Include quantifiable metrics +- Show evidence-based reasoning +- Separate short-term and long-term priorities +- Define success criteria for each recommendation + +## How to Use This Guide + +### Prerequisites + +1. **Ensure data is available:** + ```bash + ls data/issues.json data/pull_requests.json # Raw data + ls results/*.txt # Analysis outputs + ``` + +2. **If data is missing, fetch it:** + ```bash + python3 fetch_github_issues.py # Fetches to data/ + ``` + +3. 
**Run all analysis scripts:** + ```bash + python3 analyze_issues.py > results/general-analysis.txt + python3 analyze_critical_issues.py > results/critical-analysis.txt + python3 analyze_bugfix_files.py > results/bugfix-files-analysis.txt + ``` + +### Creating the Analysis + +**Phase 1: Identify Top 10 Problem Areas** +- Follow Steps 1-3 in "Analysis Process" above +- Extract metrics from `results/*.txt` files +- Cross-reference to identify problem areas +- Select top 10 areas for deep-dive + +**Phase 2: Deep-Dive with Raw Data** +- Follow Steps 4-8 in "Analysis Process" above +- For each of the 10 areas, write Python scripts or use jq/grep to: + - Find relevant issues in `data/issues.json` + - Find relevant PRs in `data/pull_requests.json` + - Extract specific examples and evidence + - Identify patterns and root causes + +**Phase 3: Write the Report** +- Create `results/ISSUE_ANALYSIS.md` +- Follow the structure defined in this document +- Include data-backed evidence from both phases +- Add specific issue/PR numbers as examples + +**Phase 4: Validate** +- Ensure all 10 areas have concrete evidence +- Verify recommendations are actionable +- Check that success metrics are quantifiable +- Confirm the analysis meets all success criteria +- **Verify data-driven approach**: Every claim must reference specific data + - No assumptions about which components are problematic + - Priority scores calculated from actual metrics + - Issue/PR numbers cited for all examples + - Patterns backed by frequency counts from raw data + +### Tools for Deep-Dive Analysis + +**Command-line tools:** +```bash +# Find issues by keyword +jq '.[] | select(.title | contains("crash")) | {number, title}' data/issues.json + +# Find PRs modifying specific files +jq '.[] | select(.files_changed[]?.filename | contains("slang-ir")) | .number' data/pull_requests.json + +# Count issues by label +jq '[.[] | .labels[].name] | group_by(.) | map({label: .[0], count: length})' data/issues.json +``` + +**Python snippets:** +- See Step 4 in Analysis Process for examples +- Can create ad-hoc scripts to analyze patterns +- Use pandas for more complex analysis if needed + +## Common Pitfalls to Avoid + +### 1. Making Assumptions +**Don't:** +- Assume certain components are problematic based on intuition +- Pre-select areas to investigate +- Cherry-pick data to support preconceived notions + +**Do:** +- Let the data guide you to problem areas +- Follow the priority score methodology +- Be surprised by what the data shows + +### 2. Ignoring Context +**Don't:** +- Look only at absolute bug counts (large components naturally have more bugs) +- Ignore LOC when comparing components +- Compare components without considering their complexity + +**Do:** +- Always normalize by LOC (bugs per 1000 LOC) +- Consider component purpose (compiler core vs. utility functions) +- Look at bug fix frequency trends, not just snapshots + +### 3. Insufficient Evidence +**Don't:** +- Make recommendations without citing specific issues/PRs +- Generalize from 1-2 examples +- Rely solely on metrics without examining actual issues + +**Do:** +- Provide 3-5 concrete issue/PR examples per area +- Show patterns across multiple instances +- Quote actual issue descriptions and error messages +- Link metrics to real-world impact + +### 4. 
Vague Recommendations +**Don't:** +- Say "improve testing" without specifics +- Suggest "refactor component X" without identifying what to refactor +- Give recommendations without success metrics + +**Do:** +- Specify what to test (e.g., "Add validation tests for IR inlining edge cases") +- Identify specific code patterns to refactor +- Define measurable success criteria for each recommendation + +### 5. Analysis Staleness +**Don't:** +- Use outdated analysis outputs +- Assume patterns from old data still apply +- Mix data from different time periods + +**Do:** +- Re-run all analysis scripts before starting +- Note the data snapshot date in the report +- Consider trends over time, not just current state +- Update analysis regularly (quarterly recommended) + diff --git a/tools/issue-analysis/README.md b/tools/issue-analysis/README.md new file mode 100644 index 0000000000..34729c9633 --- /dev/null +++ b/tools/issue-analysis/README.md @@ -0,0 +1,280 @@ +# GitHub Issues Analysis Tools + +Tools to analyze Slang's GitHub issues and pull requests for identifying bug-prone components, and areas needing improvement. + +## Purpose + +These scripts help answer questions like: +- Which components have the most bugs? +- What files are changed most often for bug fixes? +- Where should we focus testing efforts? +- Which areas take longest to fix (complexity indicators)? +- What's the bug fix rate and test coverage? +- Are there patterns in crashes, ICEs, or validation errors? + +## Available Scripts + +### 1. `fetch_github_issues.py` +Downloads issue and PR data from GitHub API with full historical data. + +**Features:** +- Fetches all open and closed issues and PRs +- Extracts issue references from PR titles/bodies (e.g., "fixes #123") +- Downloads file changes for each PR +- Supports incremental updates (only fetch new/updated items) + +### 2. `analyze_issues.py` +General quality analysis across all issues and PRs. + +**Reports on:** +- Bug rate: 16.2% of issues, 35.5% of PRs +- Component-level bug distribution +- Time to close by component +- Test coverage by PR type +- Most frequently changed files +- Development velocity + +### 3. `analyze_critical_issues.py` +Deep dive into critical issues (crashes, ICEs, validation errors). + +**Reports on:** +- Critical issue categorization +- Root cause component analysis +- File-level hotspots for critical bugs +- Open critical issues ranked by urgency + +### 4. `analyze_bugfix_files.py` +File-level bug fix analysis. + +**Reports on:** +- Most bug-prone files and components +- Source file breakdown by component (30+ categories) +- Bug fix frequency by file type +- Top files per component + +## Quick Start + +### Step 1: Fetch Data + +```bash +cd tools/issue-analysis + +# Optional: Set GitHub token to avoid rate limits +export GITHUB_TOKEN="your_github_token_here" + +# Full fetch (~15-20 minutes, first time) +python3 fetch_github_issues.py + +# Incremental update (~30 seconds, subsequent runs) +python3 fetch_github_issues.py --incremental +``` + +**Note**: Without a GitHub token, you may hit rate limits (60 requests/hour). With a token, you get 5000 requests/hour. 
Create a token at: https://github.com/settings/tokens (only needs public repo read access) + +### Step 2: Run Analysis + +```bash +# General analysis (bugs, components, files, coverage) +python3 analyze_issues.py + +# Critical issues (crashes, ICEs, validation errors) +python3 analyze_critical_issues.py + +# Bug-fix file hotspots +python3 analyze_bugfix_files.py +``` + +## Data Format + +### Fetched Data + +After running `fetch_github_issues.py`, you'll have: + +``` +tools/issue-analysis/data/ +├── issues.json # All issues with full metadata +├── pull_requests.json # All PRs with files changed and issue references +├── metadata.json # Fetch timestamp and enrichment info +└── issues_detailed.csv # Generated by analyze_issues.py +``` + +### PR Data Structure + +Each PR includes: + +```json +{ + "number": 8999, + "title": "Fix SPIRV emission bug", + "state": "closed", + "referenced_issues": [123, 456], // Extracted from title/body + "files_changed": [ + { + "filename": "source/slang/slang-emit-spirv.cpp", + "status": "modified", + "additions": 25, + "deletions": 10, + "changes": 35 + } + ] +} +``` + +## Incremental Updates + +The `--incremental` flag makes subsequent fetches much faster: + +```bash +# After initial full fetch, use incremental for updates +python3 fetch_github_issues.py --incremental +``` + +**How it works:** +- Uses GitHub's `since` parameter to fetch only items updated after last fetch +- Automatically enriches only new/updated PRs (skips already-enriched data) +- Typically <100 API calls for weekly updates vs ~5,500 for full fetch + +**When to use:** +- ✅ Daily/weekly updates +- ✅ After checking out fresh code (if data exists) +- ❌ First time setup (no existing data) + +## Analysis Output + +### Console Reports + +Each analysis script prints a comprehensive report to the console with sections like: + +``` +TOP 15 COMPONENTS BY ISSUE COUNT +---------------------------------------------------------------------- +spirv 827 issues (233 bugs, 54 open bugs) +hlsl 703 issues (120 bugs, 18 open bugs) +glsl 661 issues (144 bugs, 29 open bugs) +... + +BUGS BY COMPONENT +---------------------------------------------------------------------- +spirv 233 bugs total ( 54 open, 179 closed) +glsl 144 bugs total ( 29 open, 115 closed) +hlsl 120 bugs total ( 18 open, 102 closed) +... + +TOP 20 MOST FREQUENTLY CHANGED FILES (Hot Spots) +---------------------------------------------------------------------- +548x source/slang/hlsl.meta.slang +505x source/slang/slang-lower-to-ir.cpp +502x source/slang/slang.cpp +... 
+``` + +### CSV Export + +`analyze_issues.py` exports `data/issues_detailed.csv` which can be imported into: +- Excel/Google Sheets for pivot tables and charts +- Jupyter Notebook for custom Python analysis +- SQL Database for complex queries +- Pandas for data science workflows + +## Component Categories + +The analysis automatically categorizes issues/PRs into 30+ components: + +**Compiler Pipeline:** +- `semantic-check` - Type checking and validation +- `parser` - Source code parsing +- `ir-generation` - AST to IR lowering +- `ir-passes` - IR transformations +- `compiler-core` - Core infrastructure + +**Code Generation:** +- `spirv-emit`, `hlsl-emit`, `glsl-emit`, `cuda-emit`, `metal-emit`, `dxil-emit` +- `emit-common` - Shared code generation logic + +**Standard Libraries:** +- `hlsl-stdlib`, `core-stdlib`, `stdlib` + +**Special Features:** +- `ir-autodiff` - Automatic differentiation +- `ir-legalization` - Type and layout legalization +- `ir-specialization` - Generic specialization +- `reflection` - Parameter binding and reflection + +**Infrastructure:** +- `test` - Test infrastructure +- `build-system` - CMake, CI/CD +- `gfx-rhi` - Graphics RHI layer +- `docs` - Documentation + +## Customization + +### Add New Component Patterns + +Edit `analyze_issues.py` or `analyze_bugfix_files.py`: + +```python +def extract_keywords(text): + # Add patterns here + patterns = { + 'my-component': [r'\bmy-keyword\b', r'\bother-keyword\b'], + ... + } +``` + +### Change Bug Detection + +Edit `analyze_issues.py`: + +```python +def is_bug_fix(item): + # Customize bug detection logic + # Currently looks for: labels, title patterns, critical keywords +``` + +### Export Different Formats + +Modify `print_report()` functions to export JSON, HTML, or other formats. + +## Workflow Examples + +### Report and Review + +```bash +# Update data (fast) +python3 fetch_github_issues.py --incremental + +# Generate reports +python3 analyze_issues.py > weekly-report.txt +python3 analyze_critical_issues.py > critical-issues.txt +``` + +## Tips + +1. **First run**: Use full fetch without `--incremental` (~15-20 minutes) +2. **Regular updates**: Use `--incremental` for speed (~30 seconds) +3. **Rate limits**: Set `GITHUB_TOKEN` to avoid hitting API limits +4. **Large output**: Pipe to file or `less` for easier browsing +5. **CSV analysis**: Import `issues_detailed.csv` into Excel for custom pivots + +## Requirements + +- Python 3.6+ +- No external dependencies (uses standard library only) +- GitHub token recommended (not required) + +## Troubleshooting + +**SSL Certificate Error:** +```bash +# macOS: Install certificates +pip3 install --upgrade certifi +``` + +**Rate Limit Error:** +- Set `GITHUB_TOKEN` environment variable +- Wait for rate limit reset (shown in error message) +- Use `--incremental` for subsequent runs + +**Missing Data:** +- Run `fetch_github_issues.py` first to download data +- Check `data/metadata.json` for fetch timestamp diff --git a/tools/issue-analysis/analyze_bugfix_files.py b/tools/issue-analysis/analyze_bugfix_files.py new file mode 100755 index 0000000000..b1e683ccd3 --- /dev/null +++ b/tools/issue-analysis/analyze_bugfix_files.py @@ -0,0 +1,225 @@ +#!/usr/bin/env python3 +""" +Analyze which files are most frequently changed in bug-fix PRs. 
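+
+Reads data/pull_requests.json (via analyze_common.load_prs) and reports bug-fix
+hotspots by file, by component, and by bug-fix frequency per 1000 lines of code.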
+""" + +import re +from collections import Counter, defaultdict + +from analyze_common import get_file_loc, get_component_from_file, load_prs + +def is_bugfix_pr(pr): + """Determine if PR is a bug fix based on title and labels.""" + title = pr.get("title", "").lower() + body = (pr.get("body") or "").lower() + labels = [label["name"].lower() for label in pr.get("labels", [])] + + # Check labels + if any("bug" in label for label in labels): + return True, "labeled_bug" + if "regression" in labels: + return True, "regression" + + # Check title for fix keywords + fix_keywords = [ + r"\bfix\b", r"\bfixed\b", r"\bfixes\b", + r"\bcrash\b", r"\bice\b", + r"\bassert", r"\bassertfail", + r"\bcorrect\b", r"\brepair\b", + r"\bresolve\b", r"\bresolved\b" + ] + + for keyword in fix_keywords: + if re.search(keyword, title): + return True, "fix_keyword" + + return False, "" + +def categorize_file(filename): + """Categorize file by type.""" + if "test" in filename.lower(): + return "test" + elif filename.endswith((".h", ".hpp")): + return "header" + elif filename.endswith(".cpp"): + return "source" + elif filename.endswith(".slang"): + return "slang_code" + elif filename.endswith(".md"): + return "docs" + else: + return "other" + +def analyze_bugfix_files(prs): + """Analyze files changed in bug fix PRs.""" + + analysis = { + "total_prs": len(prs), + "bugfix_prs": 0, + "bugfix_by_type": Counter(), + "files_by_bugfix_count": Counter(), + "files_by_changes": Counter(), + "component_bugfix_count": Counter(), + "component_changes": Counter(), # Track changes per component + "component_loc": Counter(), # Track LOC per component + "file_type_distribution": Counter(), + "source_by_component": Counter(), # Track source files by component + "file_loc": {}, # NEW: Lines of code per file + "top_changed_per_component": defaultdict(Counter), + } + + bugfix_pr_list = [] + + for pr in prs: + is_bugfix, bugfix_type = is_bugfix_pr(pr) + if not is_bugfix: + continue + + if pr.get("state") != "closed": + continue # Only count merged bug fixes + + analysis["bugfix_prs"] += 1 + analysis["bugfix_by_type"][bugfix_type] += 1 + + bugfix_pr_list.append({ + "number": pr.get("number"), + "title": pr.get("title"), + "type": bugfix_type, + }) + + # Analyze files + files = pr.get("files_changed", []) + for file_info in files: + filename = file_info["filename"] + changes = file_info.get("changes", 0) + + analysis["files_by_bugfix_count"][filename] += 1 + analysis["files_by_changes"][filename] += changes + + # Get LOC for this file (cache it) + if filename not in analysis["file_loc"]: + analysis["file_loc"][filename] = get_file_loc(filename) + + # Categorize + file_type = categorize_file(filename) + analysis["file_type_distribution"][file_type] += 1 + + # Component + component = get_component_from_file(filename) + analysis["component_bugfix_count"][component] += 1 + analysis["component_changes"][component] += changes + + # Add LOC to component total + loc = analysis["file_loc"].get(filename) + if loc: + analysis["component_loc"][component] += loc + + # Track source files by component + if file_type == "source": + analysis["source_by_component"][component] += 1 + + # Track per-component files + if file_type in ["source", "header"]: + analysis["top_changed_per_component"][component][filename] += 1 + + return analysis, bugfix_pr_list + +def print_report(analysis): + """Print analysis report.""" + + print("\n" + "="*70) + print("BUG-FIX FILES ANALYSIS") + print("="*70) + + print(f"\nTotal PRs: {analysis['total_prs']}") + print(f"Bug-fix PRs 
(merged): {analysis['bugfix_prs']}") + print(f"Bug-fix rate: {(analysis['bugfix_prs'] / analysis['total_prs'] * 100):.1f}%") + + print("\n" + "-"*70) + print("BUG-FIX PR TYPES") + print("-"*70) + for bugfix_type, count in analysis["bugfix_by_type"].most_common(): + pct = (count / analysis['bugfix_prs'] * 100) + print(f"{bugfix_type:20} {count:4} ({pct:5.1f}%)") + + print("\n" + "-"*70) + print("TOP 40 FILES CHANGED IN BUG FIXES (by frequency)") + print("-"*70) + for filename, count in analysis["files_by_bugfix_count"].most_common(40): + changes = analysis["files_by_changes"][filename] + component = get_component_from_file(filename) + print(f"{count:3}x {changes:5} changes [{component:20}] {filename}") + + print("\n" + "-"*70) + print("TOP 40 FILES BY BUG FIX FREQUENCY (bug fix PRs per 1000 LOC) - source/ only") + print("-"*70) + + # Calculate bug fix frequency for files with known LOC + bug_density = [] + for filename, bugfix_count in analysis["files_by_bugfix_count"].items(): + loc = analysis["file_loc"].get(filename) + if loc and loc > 0: + # Only include source/header files under source/ directory + file_type = categorize_file(filename) + if file_type in ["source", "header"] and filename.startswith('source/'): + density = (bugfix_count / loc) * 1000 # bug fix PRs per 1000 LOC + bug_density.append((filename, bugfix_count, loc, density)) + + # Sort by density (highest first) + bug_density.sort(key=lambda x: x[3], reverse=True) + + for filename, bugfix_count, loc, density in bug_density[:40]: + component = get_component_from_file(filename) + print(f"{density:5.2f} {bugfix_count:3}x fixes {loc:6} LOC [{component:20}] {filename}") + + print("\n" + "-"*70) + print("ALL COMPONENTS BY BUG-FIX FREQUENCY") + print("-"*70) + print(f"{'Component':<30} {'Fixes':>6} {'Changes':>8} {'LOC':>10}") + print("-"*70) + for component, count in analysis["component_bugfix_count"].most_common(): + changes = analysis["component_changes"][component] + loc = analysis["component_loc"][component] + print(f"{component:<30} {count:6} {changes:8} {loc:10}") + + print("\n" + "-"*70) + print("FILE TYPE DISTRIBUTION IN BUG FIXES") + print("-"*70) + for file_type, count in analysis["file_type_distribution"].most_common(): + pct = (count / sum(analysis["file_type_distribution"].values()) * 100) + print(f"{file_type:15} {count:4} files ({pct:5.1f}%)") + + # Show second-level breakdown for source files + if file_type == "source" and analysis["source_by_component"]: + print(f" Source files by component (top 15):") + for component, src_count in analysis["source_by_component"].most_common(15): + src_pct = (src_count / count * 100) + print(f" {component:28} {src_count:4} files ({src_pct:5.1f}%)") + + # Top changed files per critical component + critical_components = ["spirv-emit", "ir-generation", "semantic-check", "ir-specialization", "type-system"] + for component in critical_components: + if component in analysis["top_changed_per_component"]: + print(f"\n" + "-"*70) + print(f"TOP FILES IN {component.upper()}") + print("-"*70) + for filename, count in analysis["top_changed_per_component"][component].most_common(10): + print(f"{count:3}x {filename}") + + print("\n" + "="*70) + +def main(): + """Main entry point.""" + print("Loading PR data...") + prs = load_prs() + + print("Analyzing bug-fix files...") + analysis, bugfix_prs = analyze_bugfix_files(prs) + + print_report(analysis) + + print("\n✓ Bug-fix file analysis complete!") + +if __name__ == "__main__": + main() + diff --git a/tools/issue-analysis/analyze_common.py 
b/tools/issue-analysis/analyze_common.py new file mode 100644 index 0000000000..2cbbf3e3bd --- /dev/null +++ b/tools/issue-analysis/analyze_common.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +""" +Common utilities for GitHub issue analysis scripts. +""" + +import json +import re +from pathlib import Path +from typing import List, Dict, Any, Optional + +# Data directory path +DATA_DIR = Path(__file__).parent / "data" + + +def get_file_loc(filepath: str) -> Optional[int]: + """Get lines of code for a file. + + Args: + filepath: Path to the file relative to repository root + + Returns: + Number of lines in the file, or None if file doesn't exist + """ + try: + full_path = Path(__file__).parent.parent.parent / filepath + if full_path.exists() and full_path.is_file(): + with open(full_path, 'r', encoding='utf-8', errors='ignore') as f: + return len(f.readlines()) + except: + pass + return None + + +def get_component_from_file(filename: str) -> str: + """Extract component from filename. + + Uses a systematic categorization scheme: + - Special categories (test, build-system, docs, etc.) are handled first + - Source files under source/ are categorized as: - + + Args: + filename: Path to the file + + Returns: + Component name (e.g., "slang-slang-ir", "test", "build-system") + """ + # Check for tests FIRST (before other patterns) + if "test" in filename.lower() or filename.startswith("tests/"): + return "test" + + # Build system and CI + if any(pattern in filename for pattern in ["CMakeLists.txt", "premake", ".github/workflows", + "build/visual-studio", ".vcxproj", "cmake/", + ".gitignore", "slang.sln", "CMakePresets.json", + "_build.sh", ".sh"]): + return "build-system" + + # Documentation + elif filename.startswith("docs/") or filename.endswith(".md"): + return "docs" + + # Examples + elif filename.startswith("examples/"): + return "examples" + + # External dependencies + elif filename.startswith("external/"): + return "external" + + # Prelude/runtime + elif filename.startswith("prelude/") or "prelude.h" in filename: + return "prelude" + + # Graphics/RHI layer + elif "tools/gfx/" in filename or "slang-gfx" in filename or "slang-rhi" in filename: + return "gfx-rhi" + + # Tools + elif "source/slangc/" in filename: + return "slangc-tool" + elif "tools/slang-generate/" in filename: + return "code-generation-tool" + elif "tools/platform/" in filename: + return "platform-tools" + + # Source files: extract as - + elif filename.startswith("source/"): + parts = filename.split('/') + if len(parts) >= 2: + dirname = parts[1] # e.g., "slang", "core", "compiler-core" + basename = parts[-1] # e.g., "slang-emit-spirv.cpp" + # Remove extension + basename = basename.rsplit('.', 1)[0] + # Split by dash or underscore and take first 2 parts + name_parts = re.split(r'[-_]', basename)[:2] + component_suffix = '-'.join(name_parts) + return f"{dirname}-{component_suffix}" + else: + return "source-other" + + else: + return "other" + + +def load_issues() -> List[Dict[str, Any]]: + """Load issues from JSON file. + + Returns: + List of issue dictionaries + + Raises: + SystemExit: If issues.json doesn't exist + """ + issues_file = DATA_DIR / "issues.json" + if not issues_file.exists(): + print(f"Error: {issues_file} not found. Run fetch_github_issues.py first.") + import sys + sys.exit(1) + + with open(issues_file) as f: + return json.load(f) + + +def load_prs() -> List[Dict[str, Any]]: + """Load pull requests from JSON file. 
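+
+    A missing pull_requests.json is treated as non-fatal (unlike load_issues): a
+    warning is printed and an empty list is returned so issue-only analysis can proceed.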
+ + Returns: + List of PR dictionaries, or empty list if file doesn't exist + """ + prs_file = DATA_DIR / "pull_requests.json" + if not prs_file.exists(): + print(f"Warning: {prs_file} not found. PR analysis will be skipped.") + return [] + + with open(prs_file) as f: + return json.load(f) + + +def load_all_data() -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """Load both issues and PRs. + + Returns: + Tuple of (issues, prs) + """ + return load_issues(), load_prs() + diff --git a/tools/issue-analysis/analyze_critical_issues.py b/tools/issue-analysis/analyze_critical_issues.py new file mode 100755 index 0000000000..9722570d7d --- /dev/null +++ b/tools/issue-analysis/analyze_critical_issues.py @@ -0,0 +1,350 @@ +#!/usr/bin/env python3 +""" +Deep analysis of critical issues: crashes, compiler errors, and their root causes. +""" + +import re +import csv +from collections import Counter, defaultdict +from typing import Dict, List, Any, Tuple + +from analyze_common import get_file_loc, get_component_from_file, load_all_data, DATA_DIR + +# Patterns to identify issue types +CRASH_PATTERNS = [ + r"crash", + r"segfault", + r"segmentation fault", + r"access violation", + r"assertion.*failed", + r"abort", + r"core dump", + r"fatal error", +] + +ERROR_PATTERNS = { + "ice": r"internal compiler error|ICE", + "assertion": r"assertion.*failed|assert\(", + "access_violation": r"access violation|segmentation fault|segfault", + "null_pointer": r"null.*pointer|nullptr|null reference", + "stack_overflow": r"stack overflow", + "memory": r"out of memory|memory allocation failed", + "infinite_loop": r"infinite loop|hangs|freezes", + "validation": r"validation.*fail|invalid.*spirv|spirv.*validation", + "link_error": r"link.*error|unresolved.*symbol", + "codegen": r"incorrect.*code|wrong.*code|bad.*codegen", +} + +def is_critical_issue(issue: Dict[str, Any]) -> Tuple[bool, str]: + """Check if issue is critical and return the type.""" + title = issue.get("title", "").lower() + body = (issue.get("body") or "").lower() + combined = f"{title} {body}" + + # Check for crashes + for pattern in CRASH_PATTERNS: + if re.search(pattern, combined, re.IGNORECASE): + return True, "crash" + + # Check for other critical errors + for error_type, pattern in ERROR_PATTERNS.items(): + if re.search(pattern, combined, re.IGNORECASE): + return True, error_type + + # Check labels + labels = [label["name"].lower() for label in issue.get("labels", [])] + if any("bug" in label for label in labels): + return True, "bug" + + return False, "" + +def extract_component_from_text(text: str) -> List[str]: + """Extract component mentions from text using unified categorization. + + Searches for file paths in issue text and categorizes them using + get_component_from_file() for consistency with other analysis scripts. 
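+
+    Args:
+        text: Issue text (title and body) to scan for file-path mentions.
+
+    Returns:
+        List of component names found in the text (may be empty).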
+ """ + components = set() + + # Common file path patterns in Slang + # Match paths like "source/slang/slang-emit-spirv.cpp" or just "slang-emit-spirv.cpp" + file_patterns = [ + r'source/[\w/\-\.]+\.(?:cpp|h|hpp)', # Full source paths + r'slang-[\w\-]+\.(?:cpp|h|hpp)', # Slang files by name + r'tools/[\w/\-\.]+\.(?:cpp|h|hpp)', # Tools paths + ] + + for pattern in file_patterns: + matches = re.findall(pattern, text, re.IGNORECASE) + for match in matches: + # Use unified component categorization + component = get_component_from_file(match) + if component != "other": # Only add if we found a specific component + components.add(component) + + return list(components) + +def extract_error_messages(body: str) -> List[str]: + """Extract error messages from issue body.""" + if not body: + return [] + + errors = [] + # Look for code blocks with errors + code_blocks = re.findall(r"```[\s\S]*?```", body) + for block in code_blocks: + # Look for error-like lines + for line in block.split('\n'): + if re.search(r"error|Error|ERROR|fail|Fail|FAIL", line): + errors.append(line.strip()) + + # Look for quoted errors + quoted = re.findall(r'`[^`]*(?:error|fail)[^`]*`', body, re.IGNORECASE) + errors.extend(quoted) + + return errors[:5] # Limit to first 5 errors + +def is_critical_pr(pr: Dict[str, Any]) -> Tuple[bool, str]: + """Check if PR is fixing a critical issue.""" + title = pr.get("title", "").lower() + body = (pr.get("body") or "").lower() + combined = f"{title} {body}" + + # Check for crash fixes + for pattern in CRASH_PATTERNS: + if re.search(pattern, combined, re.IGNORECASE): + return True, "crash_fix" + + # Check for ICE fixes + if re.search(r"ice|internal compiler error", combined, re.IGNORECASE): + return True, "ice_fix" + + # Check for validation fixes + if re.search(r"validation|invalid.*spirv", combined, re.IGNORECASE): + return True, "validation_fix" + + # Check for assertion fixes + if re.search(r"assertion.*fail", combined, re.IGNORECASE): + return True, "assertion_fix" + + return False, "" + +def analyze_critical_issues(issues: List[Dict[str, Any]], prs: List[Dict[str, Any]]) -> Dict[str, Any]: + """Analyze critical issues in detail.""" + + analysis = { + "by_type": Counter(), + "by_component": Counter(), + "by_state": Counter(), + "by_year": Counter(), + "open_critical": [], + "error_patterns": Counter(), + "root_cause_components": Counter(), + "critical_with_prs": 0, + "critical_without_fix": 0, + } + + critical_issues = [] + + for issue in issues: + is_crit, crit_type = is_critical_issue(issue) + if not is_crit: + continue + + critical_issues.append(issue) + + # Categorize + analysis["by_type"][crit_type] += 1 + analysis["by_state"][issue.get("state", "unknown")] += 1 + + # Year + created = issue.get("created_at", "") + if created: + year = created[:4] + analysis["by_year"][year] += 1 + + # Extract components from title and body + title = issue.get("title", "") + body = issue.get("body", "") or "" + combined = f"{title}\n{body}" + + components = extract_component_from_text(combined) + for comp in components: + analysis["by_component"][comp] += 1 + analysis["root_cause_components"][comp] += 1 + + # Track open critical issues + if issue.get("state") == "open": + analysis["open_critical"].append({ + "number": issue.get("number"), + "title": issue.get("title"), + "type": crit_type, + "comments": issue.get("comments", 0), + "created_at": issue.get("created_at"), + "labels": [l["name"] for l in issue.get("labels", [])], + "components": components, + }) + + # Check if has related PRs + if 
issue.get("related_prs"): + analysis["critical_with_prs"] += 1 + elif issue.get("state") == "closed": + analysis["critical_without_fix"] += 1 + + # Analyze files involved in critical bug fixes from PRs + critical_bug_files = Counter() + critical_bug_files_by_changes = Counter() + file_loc = {} + critical_pr_count = 0 + + for pr in prs: + if pr.get("state") != "closed": + continue + + is_crit_pr, crit_pr_type = is_critical_pr(pr) + if not is_crit_pr: + continue + + critical_pr_count += 1 + + files = pr.get("files_changed", []) + for file_info in files: + filename = file_info["filename"] + changes = file_info.get("changes", 0) + + # Only count source files, not tests + if "test" not in filename.lower(): + critical_bug_files[filename] += 1 + critical_bug_files_by_changes[filename] += changes + + # Get LOC for this file (cache it) + if filename not in file_loc: + file_loc[filename] = get_file_loc(filename) + + analysis["critical_bug_files"] = critical_bug_files + analysis["critical_bug_files_by_changes"] = critical_bug_files_by_changes + analysis["file_loc"] = file_loc + analysis["critical_pr_count"] = critical_pr_count + analysis["total_critical"] = len(critical_issues) + + return analysis + +def print_critical_report(analysis: Dict[str, Any]): + """Print detailed critical issues report.""" + + print("\n" + "="*70) + print("CRITICAL ISSUES DEEP DIVE ANALYSIS") + print("="*70) + + print(f"\nTotal critical issues: {analysis['total_critical']}") + print(f"Open: {analysis['by_state'].get('open', 0)}") + print(f"Closed: {analysis['by_state'].get('closed', 0)}") + print(f"\nCritical bug-fix PRs analyzed: {analysis.get('critical_pr_count', 0)}") + + # Show file hotspots first - most actionable info + if analysis["critical_bug_files"]: + print("\n" + "-"*70) + print("TOP 40 FILES MOST OFTEN FIXED FOR CRITICAL BUGS") + print("-"*70) + changes_by_file = analysis.get("critical_bug_files_by_changes", {}) + for filename, count in analysis["critical_bug_files"].most_common(40): + changes = changes_by_file.get(filename, 0) + print(f"{count:3}x {changes:5} changes {filename}") + + # Show critical bug fix frequency + if analysis["critical_bug_files"] and analysis.get("file_loc"): + print("\n" + "-"*70) + print("TOP 40 FILES BY CRITICAL BUG FIX FREQUENCY (critical fixes per 1000 LOC) - source/ only") + print("-"*70) + + # Calculate critical bug fix frequency for files with known LOC + critical_density = [] + for filename, bugfix_count in analysis["critical_bug_files"].items(): + loc = analysis["file_loc"].get(filename) + if loc and loc > 0: + # Only include source files under source/ directory + if filename.startswith('source/') and filename.endswith(('.cpp', '.h', '.hpp', '.c')): + density = (bugfix_count / loc) * 1000 # critical bug fix PRs per 1000 LOC + critical_density.append((filename, bugfix_count, loc, density)) + + # Sort by density (highest first) + critical_density.sort(key=lambda x: x[3], reverse=True) + + for filename, bugfix_count, loc, density in critical_density[:40]: + print(f"{density:5.2f} {bugfix_count:3}x fixes {loc:6} LOC {filename}") + + print("\n" + "-"*70) + print("CRITICAL ISSUE TYPES") + print("-"*70) + for issue_type, count in analysis["by_type"].most_common(20): + open_count = len([i for i in analysis["open_critical"] if i["type"] == issue_type]) + print(f"{issue_type:25} {count:4} total ({open_count:3} open)") + + print("\n" + "-"*70) + print("ROOT CAUSE COMPONENTS (Critical Issues)") + print("-"*70) + if analysis["by_component"]: + for component, count in 
analysis["by_component"].most_common(15): + print(f"{component:30} {count:4} critical issues") + else: + print("No component-level data available (file mentions in issues)") + print("Run with --pr-files flag for file-level analysis") + + print("\n" + "-"*70) + print("CRITICAL ISSUES BY YEAR") + print("-"*70) + for year, count in sorted(analysis["by_year"].items()): + print(f"{year:10} {count:4}") + + print("\n" + "-"*70) + print(f"TOP 20 OPEN CRITICAL ISSUES (by discussion volume)") + print("-"*70) + open_crit = sorted(analysis["open_critical"], key=lambda x: x["comments"], reverse=True)[:20] + for issue in open_crit: + components_str = ",".join(issue["components"][:2]) if issue["components"] else "unknown" + print(f"#{issue['number']:5} [{issue['type']:15}] ({issue['comments']:2} comments) {components_str:20} {issue['title'][:40]}") + + print("\n" + "="*70) + +def export_critical_csv(analysis: Dict[str, Any]): + """Export critical issues to CSV.""" + import csv + + output_file = DATA_DIR / "critical_issues.csv" + with open(output_file, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + writer.writerow([ + "number", "title", "type", "state", "created_at", + "comments", "labels", "components" + ]) + + for issue in analysis["open_critical"]: + writer.writerow([ + issue["number"], + issue["title"], + issue["type"], + "open", + issue["created_at"], + issue["comments"], + "|".join(issue["labels"]), + "|".join(issue["components"]), + ]) + + print(f"\nCritical issues CSV exported to: {output_file}") + +def main(): + """Main entry point.""" + print("Loading data...") + issues, prs = load_all_data() + + print("Analyzing critical issues...") + analysis = analyze_critical_issues(issues, prs) + + print_critical_report(analysis) + export_critical_csv(analysis) + + print("\n✓ Critical issues analysis complete!") + +if __name__ == "__main__": + main() + diff --git a/tools/issue-analysis/analyze_issues.py b/tools/issue-analysis/analyze_issues.py new file mode 100755 index 0000000000..fc228863d8 --- /dev/null +++ b/tools/issue-analysis/analyze_issues.py @@ -0,0 +1,705 @@ +#!/usr/bin/env python3 +""" +Analyze GitHub issues and pull requests. 
+""" + +import re +import json +from collections import Counter, defaultdict +from datetime import datetime +from typing import Dict, List, Any, Tuple +import csv + +from analyze_common import get_file_loc, get_component_from_file, load_issues, load_prs, DATA_DIR + +# Common source paths in Slang +SOURCE_PATTERNS = { + "compiler-core": r"source/compiler-core/", + "slang-core": r"source/slang/", + "slang-ir": r"source/slang/slang-ir-.*\.cpp", + "slang-emit": r"source/slang/slang-emit-.*\.cpp", + "slang-check": r"source/slang/slang-check.*\.cpp", + "parser": r"source/slang/slang-parser\.cpp", + "lexer": r"source/compiler-core/slang-lexer\.cpp", + "preprocessor": r"source/slang/slang-preprocessor\.cpp", + "spirv": r"spirv|SPIRV|spir-v|SPIR-V", + "dxil": r"dxil|DXIL|DirectX", + "cuda": r"cuda|CUDA", + "metal": r"metal|Metal|MSL", + "glsl": r"glsl|GLSL", + "hlsl": r"hlsl|HLSL", + "wgsl": r"wgsl|WGSL|WebGPU", + "autodiff": r"autodiff|auto-diff|differentiation", + "generics": r"generic|template", + "cooperative-matrix": r"cooperative.matrix|CooperativeMatrix", +} + +def extract_keywords(text: str) -> List[str]: + """Extract relevant keywords from issue text.""" + if not text: + return [] + + keywords = [] + text_lower = text.lower() + + # Check for each pattern + for category, pattern in SOURCE_PATTERNS.items(): + if re.search(pattern, text, re.IGNORECASE): + keywords.append(category) + + return keywords + +def is_feature(item: Dict[str, Any]) -> bool: + """Determine if an issue/PR is a feature addition.""" + title = item.get("title", "").lower() + body = (item.get("body") or "").lower() + labels = [label["name"].lower() for label in item.get("labels", [])] + + # Check labels first (most reliable) + feature_labels = ["feature", "enhancement", "new feature", "feature request"] + if any(label in feature_labels for label in labels): + return True + + # Check title/body for feature keywords + feature_patterns = [ + r"\b(add|implement|introduce|support|enable)\s+(new\s+)?feature", + r"\benhancement\b", + r"\bnew\s+(functionality|capability|support|API|feature)", + r"\bimplement\s+(support\s+for|new)", + r"\bintroduce\s+", + ] + + combined = f"{title} {body}" + for pattern in feature_patterns: + if re.search(pattern, combined, re.IGNORECASE): + return True + + return False + +def is_bug_fix(item: Dict[str, Any]) -> bool: + """Determine if an issue/PR is a bug fix.""" + title = item.get("title", "").lower() + body = (item.get("body") or "").lower() + labels = [label["name"].lower() for label in item.get("labels", [])] + + # Check labels for bugs (including infrastructure bugs like CI Bug) + bug_labels = ["regression", "known_issues"] + if any(bug_label in label for label in labels for bug_label in bug_labels): + return True + + # "bug" in label (includes "CI Bug", "Vendor Driver Bug", etc.) + for label in labels: + # Match any label with "bug" but exclude "GoodFirstBug" (that's a difficulty marker, not bug type) + if "bug" in label and "goodfirstbug" not in label: + return True + + # Check title/body for bug fix patterns (IMPROVED!) + # Primary pattern: Look for "bug" or "fix" combinations + combined = f"{title} {body}" + + # Pattern 1: Explicit bug mentions + bug_patterns = [ + r"\bbugfix\b", # "bugfix" as single word + r"\bbug[\s-]fix", # "bug fix" or "bug-fix" + r"\bfix\b.*\bbug\b", # "fix ... bug" anywhere in text + r"\bbug\b.*\bfix", # "bug ... 
fix" anywhere in text + ] + + for pattern in bug_patterns: + if re.search(pattern, combined, re.IGNORECASE): + return True + + # Pattern 2: References to issue numbers (usually bug fixes) + issue_ref_patterns = [ + r"\bfix(es|ed|ing)?\s+#\d+", # "fix #123", "fixes #456" + r"\bresolve(s|d)?\s+#\d+", # "resolve #123", "resolved #456" + r"\bclose(s|d)?\s+#\d+", # "close #123", "closes #456" + ] + + for pattern in issue_ref_patterns: + if re.search(pattern, combined, re.IGNORECASE): + return True + + # Pattern 2b: "Fix X" at start of title (common bug fix pattern) + # But exclude obvious features: "Fix formatting", "Fix typo", "Fix comment" + title_only = title.strip() + if re.match(r"^fix(es|ed|ing)?\s+", title_only, re.IGNORECASE): + # Exclude non-bug fixes + non_bugs = [r"typo", r"comment", r"formatting", r"whitespace", r"style", + r"documentation", r"readme", r"license"] + if not any(re.search(nb, title_only, re.IGNORECASE) for nb in non_bugs): + return True + + # Pattern 3: Critical error keywords (these are always bugs) + critical_patterns = [ + r"\bcrash(es|ed|ing)?\b", # crash, crashes, crashing + r"\bsegfault", # segmentation fault + r"\bsegmentation\s+fault", # segmentation fault + r"\bassert(ion)?\s+fail", # assertion fail, assertion failed + r"\binternal\s+compiler\s+error", # ICE long form + r"\bICE\b", # ICE abbreviation + r"\bnull\s+pointer", # null pointer issues + r"\bmemory\s+leak", # memory leaks + r"\buse[\s-]after[\s-]free", # use-after-free + r"\binfinite\s+loop", # infinite loops + r"\bhang(s|ing)?\b", # hangs, hanging + ] + + for pattern in critical_patterns: + if re.search(pattern, combined, re.IGNORECASE): + return True + + # Pattern 4: Correctness issues (these are bugs) + correctness_patterns = [ + r"\bincorrect\s+(output|code|behavior|result|codegen)", + r"\binvalid\s+(code|output|spirv|hlsl|glsl)", + r"\bwrong\s+(output|code|result)", + r"\bvalidation\s+(error|fail)", + r"\bmiscompil", # miscompile, miscompilation + ] + + for pattern in correctness_patterns: + if re.search(pattern, combined, re.IGNORECASE): + return True + + return False + +def categorize_issues(issues: List[Dict[str, Any]]) -> Dict[str, Any]: + """Categorize issues by various dimensions.""" + + categories = { + "by_label": Counter(), + "by_component": Counter(), + "by_state": Counter(), + "by_year": Counter(), + "by_type": {"features": 0, "bugs": 0, "other": 0}, # NEW: Issue type breakdown + "bugs_by_component": Counter(), + "open_bugs_by_component": Counter(), + } + + bug_issues = [] + feature_issues = [] # NEW + crash_issues = [] + compiler_errors = [] + codegen_issues = [] + + for issue in issues: + # Basic categorization + state = issue.get("state", "unknown") + categories["by_state"][state] += 1 + + # Year + created_at = issue.get("created_at", "") + if created_at: + year = created_at[:4] + categories["by_year"][year] += 1 + + # Labels + labels = [label["name"] for label in issue.get("labels", [])] + for label in labels: + categories["by_label"][label] += 1 + + # Extract component from title and body + title = issue.get("title", "") + body = issue.get("body", "") or "" + combined_text = f"{title} {body}" + + keywords = extract_keywords(combined_text) + for keyword in keywords: + categories["by_component"][keyword] += 1 + + # Classify issue type (NEW!) 
+ is_feat = is_feature(issue) + is_bug_issue = is_bug_fix(issue) + + if is_feat: + issue_type = "features" + feature_issues.append(issue) + elif is_bug_issue: + issue_type = "bugs" + bug_issues.append(issue) + # Track bugs by component + for keyword in keywords: + categories["bugs_by_component"][keyword] += 1 + if state == "open": + categories["open_bugs_by_component"][keyword] += 1 + else: + issue_type = "other" + + categories["by_type"][issue_type] += 1 + + # Additional classifications + is_crash = "crash" in combined_text.lower() + has_error = re.search(r"error|fail|invalid|incorrect", combined_text, re.IGNORECASE) + is_codegen = any(k in keywords for k in ["spirv", "dxil", "cuda", "metal", "glsl", "hlsl", "wgsl"]) + + if is_crash: + crash_issues.append(issue) + + if has_error: + compiler_errors.append(issue) + + if is_codegen: + codegen_issues.append(issue) + + return { + "categories": categories, + "bug_issues": bug_issues, + "feature_issues": feature_issues, # NEW + "crash_issues": crash_issues, + "compiler_errors": compiler_errors, + "codegen_issues": codegen_issues, + } + +def analyze_time_to_close(issues: List[Dict[str, Any]]) -> Dict[str, float]: + """Calculate average time to close issues by component.""" + component_times = defaultdict(list) + + for issue in issues: + if issue.get("state") != "closed": + continue + + created_at = datetime.fromisoformat(issue["created_at"].replace("Z", "+00:00")) + closed_at = datetime.fromisoformat(issue["closed_at"].replace("Z", "+00:00")) + days_to_close = (closed_at - created_at).days + + title = issue.get("title", "") + body = issue.get("body", "") or "" + keywords = extract_keywords(f"{title} {body}") + + for keyword in keywords: + component_times[keyword].append(days_to_close) + + avg_times = {} + for component, times in component_times.items(): + if times: + avg_times[component] = sum(times) / len(times) + + return avg_times + +def analyze_prs(prs: List[Dict[str, Any]]) -> Dict[str, Any]: + """Analyze pull requests.""" + if not prs: + return {} + + pr_analysis = { + "by_state": Counter(), + "by_component": Counter(), + "by_year": Counter(), + "by_type": {"bug_fixes": 0, "other": 0}, # PR type breakdown: bugs vs other + "files_by_component": Counter(), + "most_changed_files": Counter(), + "bugfix_files": Counter(), # Track files changed in bug fix PRs + "file_loc": {}, # Lines of code per file + "test_coverage": {"with_tests": 0, "without_tests": 0}, + "test_coverage_by_type": { # Test coverage per type + "bug_fixes": {"with_tests": 0, "without_tests": 0}, + "other": {"with_tests": 0, "without_tests": 0}, + }, + "avg_time_to_merge": {}, + } + + component_merge_times = defaultdict(list) + files_changed_count = [] + all_merge_times = [] + + bug_fix_prs = [] + other_prs = [] + + for pr in prs: + # Basic stats + state = pr.get("state", "unknown") + pr_analysis["by_state"][state] += 1 + + # Year + created_at = pr.get("created_at", "") + if created_at: + year = created_at[:4] + pr_analysis["by_year"][year] += 1 + + # Classify PR type: bug fix or other + is_bug = is_bug_fix(pr) + + if is_bug: + pr_type = "bug_fixes" + bug_fix_prs.append(pr) + else: + pr_type = "other" + other_prs.append(pr) + + pr_analysis["by_type"][pr_type] += 1 + + # Extract components from title and body + title = pr.get("title", "") + body = pr.get("body", "") or "" + combined = f"{title} {body}" + keywords = extract_keywords(combined) + + for keyword in keywords: + pr_analysis["by_component"][keyword] += 1 + + # Analyze files if available + files = pr.get("files_changed", 
[]) + if files: + files_changed_count.append(len(files)) + + # Check for test files + has_test = any("test" in f["filename"].lower() for f in files) + if has_test: + pr_analysis["test_coverage"]["with_tests"] += 1 + pr_analysis["test_coverage_by_type"][pr_type]["with_tests"] += 1 + else: + pr_analysis["test_coverage"]["without_tests"] += 1 + pr_analysis["test_coverage_by_type"][pr_type]["without_tests"] += 1 + + # Track file changes + for file_info in files: + filename = file_info["filename"] + pr_analysis["most_changed_files"][filename] += 1 + + # Track bug fix files specifically + if is_bug: + pr_analysis["bugfix_files"][filename] += 1 + + # Get LOC for this file (cache it) + if filename not in pr_analysis["file_loc"]: + pr_analysis["file_loc"][filename] = get_file_loc(filename) + + # Categorize by component + for keyword in keywords: + pr_analysis["files_by_component"][keyword] += 1 + + # Time to merge for closed PRs (use closed_at as proxy for merged_at) + # Note: /issues API doesn't include merged_at, so we use closed_at for closed PRs + if pr.get("state") == "closed" and pr.get("closed_at") and pr.get("created_at"): + try: + created = datetime.fromisoformat(pr["created_at"].replace("Z", "+00:00")) + closed = datetime.fromisoformat(pr["closed_at"].replace("Z", "+00:00")) + days_to_close = (closed - created).days + + all_merge_times.append(days_to_close) + + # Track by component + for keyword in keywords: + component_merge_times[keyword].append(days_to_close) + except (ValueError, TypeError): + pass # Skip if date parsing fails + + # Calculate average merge times + for component, times in component_merge_times.items(): + if times: + pr_analysis["avg_time_to_merge"][component] = sum(times) / len(times) + + # Overall average merge time + if all_merge_times: + pr_analysis["overall_avg_merge_time"] = sum(all_merge_times) / len(all_merge_times) + pr_analysis["median_merge_time"] = sorted(all_merge_times)[len(all_merge_times) // 2] + pr_analysis["merged_pr_count"] = len(all_merge_times) + else: + pr_analysis["overall_avg_merge_time"] = 0 + pr_analysis["median_merge_time"] = 0 + pr_analysis["merged_pr_count"] = 0 + + # Average files changed per PR + if files_changed_count: + pr_analysis["avg_files_per_pr"] = sum(files_changed_count) / len(files_changed_count) + else: + pr_analysis["avg_files_per_pr"] = 0 + + return pr_analysis + +def print_report(analysis: Dict[str, Any], issues: List[Dict[str, Any]]): + """Print analysis report.""" + cats = analysis["categories"] + + print("\n" + "="*70) + print("SLANG GITHUB ISSUES AND PULL REQUESTS ANALYSIS") + print("="*70) + + # Load metadata + metadata_file = DATA_DIR / "metadata.json" + if metadata_file.exists(): + with open(metadata_file) as f: + metadata = json.load(f) + print(f"\nData fetched: {metadata['fetched_at']}") + print(f"Repository: {metadata['repo']}") + + print(f"\nTotal issues analyzed: {len(issues)}") + print(f"Open issues: {cats['by_state']['open']}") + print(f"Closed issues: {cats['by_state']['closed']}") + + # Issue Type Breakdown (NEW!) 
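+    # Percentages below reflect the mutually exclusive classification done in
+    # categorize_issues(): is_feature() takes precedence over is_bug_fix(),
+    # and everything else is counted as "other".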
+ print("\n" + "-"*70) + print("ISSUE TYPE BREAKDOWN") + print("-"*70) + by_type = cats.get("by_type", {}) + total_typed = sum(by_type.values()) + if total_typed > 0: + feat_count = by_type.get("features", 0) + bug_count = by_type.get("bugs", 0) + other_count = by_type.get("other", 0) + + feat_pct = (feat_count / total_typed) * 100 + bug_pct = (bug_count / total_typed) * 100 + other_pct = (other_count / total_typed) * 100 + + print(f"Feature Requests: {feat_count:4} issues ({feat_pct:5.1f}%)") + print(f"Bug Reports: {bug_count:4} issues ({bug_pct:5.1f}%)") + print(f"Other: {other_count:4} issues ({other_pct:5.1f}%)") + print(f"\nBug report rate: {bug_pct:.1f}% of all issues") + print(f"Feature req rate: {feat_pct:.1f}% of all issues") + + print("\n" + "-"*70) + print("TOP 15 COMPONENTS BY ISSUE COUNT") + print("-"*70) + for component, count in cats["by_component"].most_common(15): + bugs = cats["bugs_by_component"].get(component, 0) + open_bugs = cats["open_bugs_by_component"].get(component, 0) + print(f"{component:25} {count:4} issues ({bugs:3} bugs, {open_bugs:3} open bugs)") + + print("\n" + "-"*70) + print("BUGS BY COMPONENT") + print("-"*70) + for component, count in cats["bugs_by_component"].most_common(15): + open_count = cats["open_bugs_by_component"].get(component, 0) + closed_count = count - open_count + print(f"{component:25} {count:4} bugs total ({open_count:3} open, {closed_count:3} closed)") + + print("\n" + "-"*70) + print("TOP LABELS") + print("-"*70) + for label, count in cats["by_label"].most_common(15): + print(f"{label:35} {count:4}") + + print("\n" + "-"*70) + print("ISSUES BY YEAR") + print("-"*70) + for year, count in sorted(cats["by_year"].items()): + print(f"{year:10} {count:4}") + + print("\n" + "-"*70) + print("CRITICAL ISSUES") + print("-"*70) + print(f"Crash issues: {len(analysis['crash_issues'])}") + print(f"Compiler errors: {len(analysis['compiler_errors'])}") + print(f"Code generation issues: {len(analysis['codegen_issues'])}") + + # Time to close analysis + print("\n" + "-"*70) + print("AVERAGE TIME TO CLOSE (days) BY COMPONENT") + print("-"*70) + time_to_close = analyze_time_to_close(issues) + for component, avg_days in sorted(time_to_close.items(), key=lambda x: x[1], reverse=True)[:15]: + issue_count = cats["by_component"].get(component, 0) + print(f"{component:25} {avg_days:6.1f} days ({issue_count} issues)") + + # Most commented issues (indicates complexity/difficulty) + print("\n" + "-"*70) + print("MOST DISCUSSED OPEN ISSUES") + print("-"*70) + open_issues = [i for i in issues if i.get("state") == "open"] + top_commented = sorted(open_issues, key=lambda x: x.get("comments", 0), reverse=True)[:10] + for issue in top_commented: + print(f"#{issue['number']:5} ({issue.get('comments', 0):3} comments) {issue['title'][:60]}") + + print("\n" + "="*70) + +def print_pr_report(pr_analysis: Dict[str, Any], prs: List[Dict[str, Any]]): + """Print PR analysis report.""" + if not pr_analysis: + print("\nNo PR data available. 
Run with --pr-files to get detailed PR analysis.") + return + + print("\n" + "="*70) + print("PULL REQUEST ANALYSIS") + print("="*70) + + print(f"\nTotal PRs analyzed: {len(prs)}") + print(f"Merged PRs: {pr_analysis['by_state'].get('closed', 0)}") + print(f"Open PRs: {pr_analysis['by_state'].get('open', 0)}") + + # PR Type Breakdown + print("\n" + "-"*70) + print("PR TYPE BREAKDOWN") + print("-"*70) + by_type = pr_analysis.get("by_type", {}) + total_classified = sum(by_type.values()) + if total_classified > 0: + bug_count = by_type.get("bug_fixes", 0) + other_count = by_type.get("other", 0) + + bug_pct = (bug_count / total_classified) * 100 + other_pct = (other_count / total_classified) * 100 + + print(f"Bug Fixes: {bug_count:4} PRs ({bug_pct:5.1f}%)") + print(f"Other: {other_count:4} PRs ({other_pct:5.1f}%)") + print(f"\nBug fix rate: {bug_pct:.1f}% of all PRs") + + # Test coverage by type + print("\n" + "-"*70) + print("TEST COVERAGE BY PR TYPE") + print("-"*70) + test_by_type = pr_analysis.get("test_coverage_by_type", {}) + for pr_type in ["bug_fixes", "other"]: + type_name = pr_type.replace("_", " ").title() + with_tests = test_by_type.get(pr_type, {}).get("with_tests", 0) + without_tests = test_by_type.get(pr_type, {}).get("without_tests", 0) + total = with_tests + without_tests + if total > 0: + pct = (with_tests / total) * 100 + print(f"{type_name:15} {with_tests:4} / {total:4} ({pct:5.1f}% with tests)") + + # Merge time stats + if pr_analysis.get("overall_avg_merge_time"): + print(f"\nAverage time to close PR: {pr_analysis['overall_avg_merge_time']:.1f} days") + print(f"Median time to close PR: {pr_analysis['median_merge_time']:.1f} days") + print(f"(Based on {pr_analysis['merged_pr_count']} closed PRs)") + + # File change stats + if pr_analysis.get("avg_files_per_pr"): + print(f"\nAverage files changed per PR: {pr_analysis['avg_files_per_pr']:.1f}") + + # Test coverage + test_cov = pr_analysis.get("test_coverage", {}) + total_with_files = test_cov.get("with_tests", 0) + test_cov.get("without_tests", 0) + if total_with_files > 0: + test_pct = (test_cov.get("with_tests", 0) / total_with_files) * 100 + print(f"\nPRs with test files: {test_cov.get('with_tests', 0)} / {total_with_files} ({test_pct:.1f}%)") + + print("\n" + "-"*70) + print("TOP 15 COMPONENTS BY PR COUNT") + print("-"*70) + for component, count in pr_analysis["by_component"].most_common(15): + avg_close = pr_analysis["avg_time_to_merge"].get(component) + if avg_close is not None: + print(f"{component:25} {count:4} PRs (avg {avg_close:.1f} days to close)") + else: + print(f"{component:25} {count:4} PRs") + + if pr_analysis["avg_time_to_merge"]: + print("\n" + "-"*70) + print("AVERAGE TIME TO CLOSE PR (days) BY COMPONENT") + print("-"*70) + for component, avg_days in sorted(pr_analysis["avg_time_to_merge"].items(), + key=lambda x: x[1], reverse=True)[:15]: + pr_count = pr_analysis["by_component"].get(component, 0) + print(f"{component:25} {avg_days:6.1f} days ({pr_count} PRs)") + + # Most changed files (if available) + if pr_analysis["most_changed_files"]: + print("\n" + "-"*70) + print("TOP 40 MOST FREQUENTLY CHANGED FILES (Hot Spots)") + print("-"*70) + for filename, count in pr_analysis["most_changed_files"].most_common(40): + print(f"{count:3}x {filename}") + + # Bug fix frequency analysis + if pr_analysis.get("bugfix_files") and pr_analysis.get("file_loc"): + print("\n" + "-"*70) + print("TOP 40 FILES BY BUG FIX FREQUENCY (bug fix PRs per 1000 LOC) - source/ only") + print("-"*70) + + # Calculate bug fix frequency for 
files with known LOC + bug_density = [] + for filename, bugfix_count in pr_analysis["bugfix_files"].items(): + loc = pr_analysis["file_loc"].get(filename) + if loc and loc > 0: + # Only include source files under source/ directory + if filename.startswith('source/') and filename.endswith(('.cpp', '.h', '.hpp', '.c')): + density = (bugfix_count / loc) * 1000 # bug fix PRs per 1000 LOC + bug_density.append((filename, bugfix_count, loc, density)) + + # Sort by density (highest first) + bug_density.sort(key=lambda x: x[3], reverse=True) + + for filename, bugfix_count, loc, density in bug_density[:40]: + print(f"{density:5.2f} {bugfix_count:3}x fixes {loc:6} LOC {filename}") + + print("\n" + "-"*70) + print("PRs BY YEAR") + print("-"*70) + for year, count in sorted(pr_analysis["by_year"].items()): + print(f"{year:10} {count:4}") + + print("\n" + "="*70) + +def export_detailed_csv(analysis: Dict[str, Any], issues: List[Dict[str, Any]]): + """Export detailed data to CSV for further analysis.""" + import csv + + output_file = DATA_DIR / "issues_detailed.csv" + with open(output_file, "w", newline="", encoding="utf-8") as f: + writer = csv.writer(f) + + # Check if issues have PR relationships + has_pr_links = any("related_prs" in issue for issue in issues) + + header = [ + "number", "state", "title", "created_at", "closed_at", + "comments", "labels", "components", "is_bug", "is_crash" + ] + if has_pr_links: + header.append("related_prs") + + writer.writerow(header) + + for issue in issues: + title = issue.get("title", "") + body = issue.get("body", "") or "" + combined = f"{title} {body}" + + keywords = extract_keywords(combined) + labels = [label["name"] for label in issue.get("labels", [])] + is_bug = any( + "bug" in label.lower() or + label.lower() in ["regression", "known_issues"] + for label in labels + ) + is_crash = "crash" in combined.lower() + + row = [ + issue.get("number", ""), + issue.get("state", ""), + title, + issue.get("created_at", ""), + issue.get("closed_at", ""), + issue.get("comments", 0), + "|".join(labels), + "|".join(keywords), + is_bug, + is_crash + ] + + if has_pr_links: + related_prs = issue.get("related_prs", []) + row.append("|".join(str(pr) for pr in related_prs)) + + writer.writerow(row) + + print(f"\nDetailed CSV exported to: {output_file}") + +def main(): + """Main entry point.""" + print("Loading issues...") + issues = load_issues() + + print("Loading PRs...") + prs = load_prs() + + print("Analyzing issues...") + analysis = categorize_issues(issues) + + print_report(analysis, issues) + + if prs: + print("\nAnalyzing PRs...") + pr_analysis = analyze_prs(prs) + print_pr_report(pr_analysis, prs) + + export_detailed_csv(analysis, issues) + + print("\n✓ Analysis complete!") + +if __name__ == "__main__": + main() + diff --git a/tools/issue-analysis/fetch_github_issues.py b/tools/issue-analysis/fetch_github_issues.py new file mode 100755 index 0000000000..1f1118f47d --- /dev/null +++ b/tools/issue-analysis/fetch_github_issues.py @@ -0,0 +1,398 @@ +#!/usr/bin/env python3 +""" +Fetch GitHub issues and PRs from shader-slang/slang repository. +Saves data locally for analysis. 
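+
+Usage:
+    python fetch_github_issues.py                 # full fetch of all issues and PRs
+    python fetch_github_issues.py --incremental   # fetch only items updated since the last run
+
+Set the GITHUB_TOKEN environment variable to avoid GitHub API rate limits.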
+""" + +import json +import os +import sys +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Any, Tuple +import urllib.request +import urllib.error +import time +import ssl + +# Configuration +REPO_OWNER = "shader-slang" +REPO_NAME = "slang" +OUTPUT_DIR = Path(__file__).parent / "data" +GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "") + +# Create SSL context that works on macOS +def get_ssl_context(): + """Create SSL context for HTTPS requests.""" + try: + # Try to use certifi's certificates if available + import certifi + return ssl.create_default_context(cafile=certifi.where()) + except ImportError: + # certifi not available, try default + try: + return ssl.create_default_context() + except Exception: + pass + except Exception: + pass + + # Fallback: create unverified context (not ideal but works) + print("Warning: Using unverified SSL context (certificate verification disabled)") + return ssl._create_unverified_context() + +def make_github_request(url: str) -> Dict[str, Any]: + """Make a request to GitHub API with authentication if available.""" + headers = { + "Accept": "application/vnd.github.v3+json", + "User-Agent": "Slang-Issue-Analyzer" + } + if GITHUB_TOKEN: + headers["Authorization"] = f"token {GITHUB_TOKEN}" + + request = urllib.request.Request(url, headers=headers) + try: + ssl_context = get_ssl_context() + with urllib.request.urlopen(request, context=ssl_context) as response: + return json.loads(response.read().decode()) + except urllib.error.HTTPError as e: + print(f"HTTP Error {e.code}: {e.reason}") + if e.code == 403: + print("Rate limit likely exceeded. Set GITHUB_TOKEN environment variable.") + raise + +def fetch_all_pages(base_url: str, params: Dict[str, str], since: str = None) -> List[Dict[str, Any]]: + """Fetch all pages of results from GitHub API.""" + all_items = [] + page = 1 + per_page = 100 + + while True: + params_with_page = {**params, "page": str(page), "per_page": str(per_page)} + if since: + params_with_page["since"] = since + query_string = "&".join(f"{k}={v}" for k, v in params_with_page.items()) + url = f"{base_url}?{query_string}" + + print(f"Fetching page {page}...", end=" ", flush=True) + items = make_github_request(url) + + if not items: + print("Done!") + break + + all_items.extend(items) + print(f"Got {len(items)} items (total: {len(all_items)})") + + if len(items) < per_page: + break + + page += 1 + time.sleep(0.5) # Be nice to GitHub API + + return all_items + +def fetch_issues_and_prs(since: str = None): + """Fetch all issues and pull requests, optionally since a specific date.""" + base_url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/issues" + + if since: + print("\n=== Fetching Updated Issues and Pull Requests (Incremental) ===") + print(f"Fetching items updated since: {since}") + else: + print("\n=== Fetching Issues and Pull Requests ===") + print("Note: GitHub API returns both issues and PRs in the /issues endpoint") + + # Fetch all (both open and closed) + all_items = [] + for state in ["open", "closed"]: + print(f"\nFetching {state} items...") + items = fetch_all_pages(base_url, {"state": state}, since=since) + all_items.extend(items) + + # Separate issues from PRs + issues = [item for item in all_items if "pull_request" not in item] + prs = [item for item in all_items if "pull_request" in item] + + print(f"\n=== Summary ===") + print(f"Total items fetched: {len(all_items)}") + print(f"Issues: {len(issues)}") + print(f"Pull Requests: {len(prs)}") + + return issues, prs + +def 
extract_issues_from_pr(pr: Dict[str, Any]) -> List[int]: + """ + Extract issue numbers referenced in PR title and body. + Looks for patterns like: fixes #123, closes #456, resolves #789 + + This matches GitHub's own issue linking behavior. + """ + import re + + title = pr.get("title", "") + body = pr.get("body") or "" + text = f"{title} {body}" + + # Pattern matches: fix/fixes/fixed/close/closes/closed/resolve/resolves/resolved #NUMBER + # Also matches bare #NUMBER references + pattern = r'(?:fix(?:es|ed)?|close(?:s|d)?|resolve(?:s|d)?)\s*#(\d+)|(?:^|\s)#(\d+)' + + issue_numbers = [] + for match in re.finditer(pattern, text, re.IGNORECASE): + # match.group(1) is from fix/close/resolve pattern + # match.group(2) is from bare # pattern + num = match.group(1) or match.group(2) + if num: + issue_numbers.append(int(num)) + + # Remove duplicates and sort + return sorted(list(set(issue_numbers))) + +def fetch_pr_files(pr_number: int) -> List[Dict[str, Any]]: + """Fetch list of files changed in a PR.""" + url = f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/pulls/{pr_number}/files" + try: + files = fetch_all_pages(url, {}) + # Extract key information from each file + return [{ + "filename": f.get("filename"), + "status": f.get("status"), # "added", "removed", "modified", "renamed" + "additions": f.get("additions", 0), + "deletions": f.get("deletions", 0), + "changes": f.get("changes", 0), + } for f in files] + except Exception as e: + print(f"Error fetching files for PR #{pr_number}: {e}") + return [] + + +def load_existing_data() -> Tuple[List[Dict], List[Dict], Dict]: + """Load existing data if available.""" + issues_file = OUTPUT_DIR / "issues.json" + prs_file = OUTPUT_DIR / "pull_requests.json" + metadata_file = OUTPUT_DIR / "metadata.json" + + issues = [] + prs = [] + metadata = {} + + if issues_file.exists(): + with open(issues_file, "r") as f: + issues = json.load(f) + + if prs_file.exists(): + with open(prs_file, "r") as f: + prs = json.load(f) + + if metadata_file.exists(): + with open(metadata_file, "r") as f: + metadata = json.load(f) + + return issues, prs, metadata + +def merge_data(existing: List[Dict], new: List[Dict]) -> List[Dict]: + """Merge new data with existing data, avoiding duplicates.""" + # Create a dictionary keyed by issue/PR number for fast lookup + merged = {item["number"]: item for item in existing} + + # Update or add new items + for item in new: + merged[item["number"]] = item + + # Return as list, sorted by number + return sorted(merged.values(), key=lambda x: x["number"]) + +def save_data(issues: List[Dict], prs: List[Dict], enrichments: Dict[str, bool] = None): + """Save fetched data to JSON files.""" + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().isoformat() + metadata = { + "fetched_at": timestamp, + "repo": f"{REPO_OWNER}/{REPO_NAME}", + "issue_count": len(issues), + "pr_count": len(prs), + "enrichments": enrichments or {} + } + + # Save issues + issues_file = OUTPUT_DIR / "issues.json" + with open(issues_file, "w") as f: + json.dump(issues, f, indent=2) + print(f"\nSaved {len(issues)} issues to {issues_file}") + + # Save PRs + prs_file = OUTPUT_DIR / "pull_requests.json" + with open(prs_file, "w") as f: + json.dump(prs, f, indent=2) + print(f"Saved {len(prs)} pull requests to {prs_file}") + + # Save metadata + metadata_file = OUTPUT_DIR / "metadata.json" + with open(metadata_file, "w") as f: + json.dump(metadata, f, indent=2) + print(f"Saved metadata to {metadata_file}") + +def add_issue_references_to_prs(prs: 
List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """ + Extract issue references from PR titles and bodies. + This is fast (no API calls) and matches GitHub's linking behavior. + """ + print("\n=== Extracting Issue References from PRs ===") + + total_refs = 0 + for pr in prs: + issue_refs = extract_issues_from_pr(pr) + pr["referenced_issues"] = issue_refs + total_refs += len(issue_refs) + + print(f"✓ Found {total_refs} issue references across {len(prs)} PRs") + return prs + +def enrich_prs_with_files(prs: List[Dict[str, Any]], only_new: bool = False) -> List[Dict[str, Any]]: + """ + Enrich PRs with file change information. + WARNING: This makes additional API calls and can be slow! + Only run this for detailed analysis. + + Args: + prs: List of PRs to enrich + only_new: If True, only enrich PRs that don't already have files_changed + """ + # Filter to only PRs that need enrichment + if only_new: + prs_to_enrich = [pr for pr in prs if "files_changed" not in pr] + already_enriched = len(prs) - len(prs_to_enrich) + else: + prs_to_enrich = prs + already_enriched = 0 + + if not prs_to_enrich: + print("\n=== All PRs Already Have File Changes ===") + return prs + + print("\n=== Enriching PRs with File Changes ===") + print(f"PRs to enrich: {len(prs_to_enrich)}") + if already_enriched > 0: + print(f"Already enriched: {already_enriched}") + print("This may take a while and consume API rate limit.") + + # Create a mapping for quick lookup + pr_map = {pr["number"]: pr for pr in prs} + + for i, pr in enumerate(prs_to_enrich): + if (i + 1) % 50 == 0: + print(f"Processed {i + 1}/{len(prs_to_enrich)} PRs...") + + # Fetch files changed in this PR + files = fetch_pr_files(pr["number"]) + pr_map[pr["number"]]["files_changed"] = files + + # Rate limiting protection + if (i + 1) % 100 == 0: + print("Pausing to respect rate limits...") + time.sleep(2) + + print(f"✓ Enriched {len(prs_to_enrich)} PRs (skipped {already_enriched} already enriched)") + return list(pr_map.values()) + +def main(): + """Main entry point.""" + import argparse + + parser = argparse.ArgumentParser(description="Fetch Slang GitHub issues and PRs") + parser.add_argument( + "--incremental", + action="store_true", + help="Incremental update: fetch only items updated since last fetch" + ) + args = parser.parse_args() + + if not GITHUB_TOKEN: + print("WARNING: GITHUB_TOKEN environment variable not set.") + print("You may hit rate limits quickly without authentication.") + print("Create a token at: https://github.com/settings/tokens") + response = input("\nContinue anyway? (y/N): ") + if response.lower() != 'y': + sys.exit(1) + + print(f"\nFetching data from {REPO_OWNER}/{REPO_NAME}") + print(f"Output directory: {OUTPUT_DIR}\n") + + try: + # Check for incremental update + since = None + existing_issues = [] + existing_prs = [] + previous_enrichments = {} + + if args.incremental: + existing_issues, existing_prs, metadata = load_existing_data() + if metadata and "fetched_at" in metadata: + since = metadata["fetched_at"] + previous_enrichments = metadata.get("enrichments", {}) + + print(f"=== Incremental Update Mode ===") + print(f"Existing data: {len(existing_issues)} issues, {len(existing_prs)} PRs") + print(f"Last fetch: {since}") + + # Show previous enrichments info + if previous_enrichments: + print(f"Previous enrichments detected: {', '.join(k for k, v in previous_enrichments.items() if v)}") + + print(f"Fetching updates since then...\n") + else: + print("No existing data found. 
Performing full fetch.") + args.incremental = False + + # Fetch new/updated data + new_issues, new_prs = fetch_issues_and_prs(since=since if args.incremental else None) + + # Merge with existing data if incremental + if args.incremental and existing_issues: + print(f"\n=== Merging Data ===") + print(f"New items fetched: {len(new_issues)} issues, {len(new_prs)} PRs") + issues = merge_data(existing_issues, new_issues) + prs = merge_data(existing_prs, new_prs) + print(f"After merge: {len(issues)} issues, {len(prs)} PRs") + else: + issues = new_issues + prs = new_prs + + # Always add issue references from PRs (fast, no API calls!) + prs = add_issue_references_to_prs(prs) + + # Always enrich PRs with file changes (only new ones in incremental mode) + if args.incremental and existing_prs: + # In incremental mode, only enrich PRs that don't have files yet + print(f"\n→ Smart enrichment: Only fetching files for new/updated PRs") + prs = enrich_prs_with_files(prs, only_new=True) + else: + prs = enrich_prs_with_files(prs, only_new=False) + + # Track which enrichments were applied + enrichments = { + "pr_files": True, # Always included now + "issue_references": True # Always included now + } + + save_data(issues, prs, enrichments) + print("\n✓ Data fetch complete!") + + if args.incremental: + print("\nIncremental update successful! Data merged with existing.") + + print("\nData enrichments:") + print(" ✓ PRs include 'referenced_issues' field (issue numbers from PR title/body)") + print(" ✓ PRs include 'files_changed' field (detailed file change information)") + + except Exception as e: + print(f"\n✗ Error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + +if __name__ == "__main__": + main() +