From e14ab36adb5b23b50e09327b33ad5be8598dfef8 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 12:49:19 +0200 Subject: [PATCH 01/36] feat: add script to check repository metrics via GitHub GraphQL API --- metrics_check.sh | 106 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 metrics_check.sh diff --git a/metrics_check.sh b/metrics_check.sh new file mode 100644 index 0000000..04373f6 --- /dev/null +++ b/metrics_check.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Check if required environment variables are set +if [ -z "$GITHUB_TOKEN" ]; then + echo "Error: GITHUB_TOKEN environment variable is not set." + exit 1 +fi + +if [ -z "$OWNER" ]; then + echo "Error: OWNER environment variable is not set." + exit 1 +fi + +if [ -z "$REPO_NAME" ]; then + echo "Error: REPO_NAME environment variable is not set." + exit 1 +fi + +# Calculate the date one year ago in ISO 8601 format +SINCE_DATE=$(date -u -d "365 days ago" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -v-365d -u +"%Y-%m-%dT%H:%M:%SZ") + +echo "Checking metrics for repository: $OWNER/$REPO_NAME" +echo "==================================================" + +# 1. Check for README and OSI-Approved License +echo "1. Checking for README and OSI-approved license..." 
+README_LICENSE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{ + \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { readme { name } licenseInfo { key name } } }\" + }" https://api.github.com/graphql) + +# Parse the response +HAS_README=$(echo "$README_LICENSE_RESPONSE" | grep -q '"readme":{' && echo "true" || echo "false") +HAS_LICENSE=$(echo "$README_LICENSE_RESPONSE" | grep -q '"licenseInfo":{' && echo "true" || echo "false") + +if [ "$HAS_README" = "true" ] && [ "$HAS_LICENSE" = "true" ]; then + echo " README and OSI-approved license: PASS" +else + echo " README and OSI-approved license: FAIL" +fi + +# 2. Check for at least one release in the last year +echo "2. Checking for releases in the last year..." +RELEASE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{ + \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { releases(last: 1, orderBy: {field: CREATED_AT, direction: DESC}) { edges { node { publishedAt } } } } }\" + }" https://api.github.com/graphql) + +# Check if the most recent release is within the last year +LATEST_RELEASE_DATE=$(echo "$RELEASE_RESPONSE" | grep -o '"publishedAt":"[^"]*"' | cut -d'"' -f4) +if [ -n "$LATEST_RELEASE_DATE" ]; then + RELEASE_EPOCH=$(date -d "$LATEST_RELEASE_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$LATEST_RELEASE_DATE" +%s) + YEAR_AGO_EPOCH=$(date -d "$SINCE_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$SINCE_DATE" +%s) + if [ "$RELEASE_EPOCH" -gt "$YEAR_AGO_EPOCH" ]; then + echo " Release in last year: yes" + else + echo " Release in last year: no" + fi +else + echo " Release in last year: no" +fi + +# 3. Count active contributors (last 12 months) +echo "3. Counting active contributors in the last 12 months..." 
+CONTRIBUTORS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{ + \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { defaultBranchRef { target { ... on Commit { history(since: \\\"$SINCE_DATE\\\") { nodes { author { user { login } } } } } } } } }\" + }" https://api.github.com/graphql) + +# Extract and count unique logins +UNIQUE_CONTRIBUTORS=$(echo "$CONTRIBUTORS_RESPONSE" | grep -o '"login":"[^"]*"' | sort | uniq | wc -l) +if [ "$UNIQUE_CONTRIBUTORS" -gt 3 ]; then + echo " Number of active contributors: above 3 ($UNIQUE_CONTRIBUTORS)" +else + echo " Number of active contributors: below 3 ($UNIQUE_CONTRIBUTORS)" +fi + +# 4. Count commits per month (last 12 months) +echo "4. Calculating average commits per month..." +COMMITS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{ + \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { defaultBranchRef { target { ... on Commit { history(since: \\\"$SINCE_DATE\\\") { totalCount } } } } } }\" + }" https://api.github.com/graphql) + +TOTAL_COMMITS=$(echo "$COMMITS_RESPONSE" | grep -o '"totalCount":[0-9]*' | cut -d':' -f2) +AVG_COMMITS_PER_MONTH=$((TOTAL_COMMITS / 12)) +if [ "$AVG_COMMITS_PER_MONTH" -gt 2 ]; then + echo " Average commits per month: above 2 ($AVG_COMMITS_PER_MONTH)" +else + echo " Average commits per month: below 2 ($AVG_COMMITS_PER_MONTH)" +fi + +# 5. Count open issues +echo "5. Counting open issues..." 
+ISSUES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "{ + \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { issues(states: OPEN) { totalCount } } }\" + }" https://api.github.com/graphql) + +OPEN_ISSUES=$(echo "$ISSUES_RESPONSE" | grep -o '"totalCount":[0-9]*' | cut -d':' -f2) +echo " Number of open issues: $OPEN_ISSUES" From 461c5fcd0e1cca5c3c829887383f244b381ca4dc Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 12:54:50 +0200 Subject: [PATCH 02/36] chore: update permissions for metrics_check.sh script --- metrics_check.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 metrics_check.sh diff --git a/metrics_check.sh b/metrics_check.sh old mode 100644 new mode 100755 From c6ad63dab2994d52f34bf46ca52402fb9344db57 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 12:54:51 +0200 Subject: [PATCH 03/36] fix: make bash script more verbose with detailed GraphQL queries, responses, and computations shown at each step --- metrics_check.sh | 78 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 25 deletions(-) diff --git a/metrics_check.sh b/metrics_check.sh index 04373f6..c8d92d6 100755 --- a/metrics_check.sh +++ b/metrics_check.sh @@ -18,89 +18,117 @@ fi # Calculate the date one year ago in ISO 8601 format SINCE_DATE=$(date -u -d "365 days ago" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -v-365d -u +"%Y-%m-%dT%H:%M:%SZ") +echo "Since date: $SINCE_DATE" echo "Checking metrics for repository: $OWNER/$REPO_NAME" echo "==================================================" # 1. Check for README and OSI-Approved License echo "1. Checking for README and OSI-approved license..." 
+QUERY1='{ + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { readme { name } licenseInfo { key name } } }" +}' +echo " Query: $QUERY1" README_LICENSE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ - -d "{ - \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { readme { name } licenseInfo { key name } } }\" - }" https://api.github.com/graphql) + -d "$QUERY1" https://api.github.com/graphql) +echo " Raw response: $README_LICENSE_RESPONSE" # Parse the response HAS_README=$(echo "$README_LICENSE_RESPONSE" | grep -q '"readme":{' && echo "true" || echo "false") HAS_LICENSE=$(echo "$README_LICENSE_RESPONSE" | grep -q '"licenseInfo":{' && echo "true" || echo "false") +echo " Has README: $HAS_README" +echo " Has License: $HAS_LICENSE" if [ "$HAS_README" = "true" ] && [ "$HAS_LICENSE" = "true" ]; then - echo " README and OSI-approved license: PASS" + echo " Result: README and OSI-approved license: PASS" else - echo " README and OSI-approved license: FAIL" + echo " Result: README and OSI-approved license: FAIL" fi +echo # 2. Check for at least one release in the last year echo "2. Checking for releases in the last year..." 
+QUERY2='{ + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { releases(last: 1, orderBy: {field: CREATED_AT, direction: DESC}) { edges { node { publishedAt } } } } }" +}' +echo " Query: $QUERY2" RELEASE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ - -d "{ - \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { releases(last: 1, orderBy: {field: CREATED_AT, direction: DESC}) { edges { node { publishedAt } } } } }\" - }" https://api.github.com/graphql) + -d "$QUERY2" https://api.github.com/graphql) +echo " Raw response: $RELEASE_RESPONSE" # Check if the most recent release is within the last year LATEST_RELEASE_DATE=$(echo "$RELEASE_RESPONSE" | grep -o '"publishedAt":"[^"]*"' | cut -d'"' -f4) +echo " Latest release date: $LATEST_RELEASE_DATE" if [ -n "$LATEST_RELEASE_DATE" ]; then RELEASE_EPOCH=$(date -d "$LATEST_RELEASE_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$LATEST_RELEASE_DATE" +%s) YEAR_AGO_EPOCH=$(date -d "$SINCE_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$SINCE_DATE" +%s) + echo " Release epoch: $RELEASE_EPOCH" + echo " Year ago epoch: $YEAR_AGO_EPOCH" if [ "$RELEASE_EPOCH" -gt "$YEAR_AGO_EPOCH" ]; then - echo " Release in last year: yes" + echo " Result: Release in last year: yes" else - echo " Release in last year: no" + echo " Result: Release in last year: no" fi else - echo " Release in last year: no" + echo " Result: Release in last year: no" fi +echo # 3. Count active contributors (last 12 months) echo "3. Counting active contributors in the last 12 months..." +QUERY3='{ + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... 
on Commit { history(since: \"'"$SINCE_DATE"'\") { nodes { author { user { login } } } } } } } } }" +}' +echo " Query: $QUERY3" CONTRIBUTORS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ - -d "{ - \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { defaultBranchRef { target { ... on Commit { history(since: \\\"$SINCE_DATE\\\") { nodes { author { user { login } } } } } } } } }\" - }" https://api.github.com/graphql) + -d "$QUERY3" https://api.github.com/graphql) +echo " Raw response: $CONTRIBUTORS_RESPONSE" # Extract and count unique logins UNIQUE_CONTRIBUTORS=$(echo "$CONTRIBUTORS_RESPONSE" | grep -o '"login":"[^"]*"' | sort | uniq | wc -l) +echo " Unique contributors count: $UNIQUE_CONTRIBUTORS" if [ "$UNIQUE_CONTRIBUTORS" -gt 3 ]; then - echo " Number of active contributors: above 3 ($UNIQUE_CONTRIBUTORS)" + echo " Result: Number of active contributors: above 3 ($UNIQUE_CONTRIBUTORS)" else - echo " Number of active contributors: below 3 ($UNIQUE_CONTRIBUTORS)" + echo " Result: Number of active contributors: below 3 ($UNIQUE_CONTRIBUTORS)" fi +echo # 4. Count commits per month (last 12 months) echo "4. Calculating average commits per month..." +QUERY4='{ + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... on Commit { history(since: \"'"$SINCE_DATE"'\") { totalCount } } } } } }" +}' +echo " Query: $QUERY4" COMMITS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ - -d "{ - \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { defaultBranchRef { target { ... 
on Commit { history(since: \\\"$SINCE_DATE\\\") { totalCount } } } } } }\" - }" https://api.github.com/graphql) + -d "$QUERY4" https://api.github.com/graphql) +echo " Raw response: $COMMITS_RESPONSE" TOTAL_COMMITS=$(echo "$COMMITS_RESPONSE" | grep -o '"totalCount":[0-9]*' | cut -d':' -f2) +echo " Total commits: $TOTAL_COMMITS" AVG_COMMITS_PER_MONTH=$((TOTAL_COMMITS / 12)) +echo " Average commits per month: $AVG_COMMITS_PER_MONTH" if [ "$AVG_COMMITS_PER_MONTH" -gt 2 ]; then - echo " Average commits per month: above 2 ($AVG_COMMITS_PER_MONTH)" + echo " Result: Average commits per month: above 2 ($AVG_COMMITS_PER_MONTH)" else - echo " Average commits per month: below 2 ($AVG_COMMITS_PER_MONTH)" + echo " Result: Average commits per month: below 2 ($AVG_COMMITS_PER_MONTH)" fi +echo # 5. Count open issues echo "5. Counting open issues..." +QUERY5='{ + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { issues(states: OPEN) { totalCount } } }" +}' +echo " Query: $QUERY5" ISSUES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ - -d "{ - \"query\": \"query { repository(owner: \\\"$OWNER\\\", name: \\\"$REPO_NAME\\\") { issues(states: OPEN) { totalCount } } }\" - }" https://api.github.com/graphql) + -d "$QUERY5" https://api.github.com/graphql) +echo " Raw response: $ISSUES_RESPONSE" OPEN_ISSUES=$(echo "$ISSUES_RESPONSE" | grep -o '"totalCount":[0-9]*' | cut -d':' -f2) -echo " Number of open issues: $OPEN_ISSUES" +echo " Result: Number of open issues: $OPEN_ISSUES" From f9a09d73a09092c4837844fe8411547e99d8f55b Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 13:11:03 +0200 Subject: [PATCH 04/36] refactor: Update script to gather comprehensive repository metrics including .md files, license, releases, contributors, commits, and issues --- metrics_check.sh | 154 +++++++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 73 deletions(-) diff --git 
a/metrics_check.sh b/metrics_check.sh index c8d92d6..916beb9 100755 --- a/metrics_check.sh +++ b/metrics_check.sh @@ -23,112 +23,120 @@ echo "Since date: $SINCE_DATE" echo "Checking metrics for repository: $OWNER/$REPO_NAME" echo "==================================================" -# 1. Check for README and OSI-Approved License -echo "1. Checking for README and OSI-approved license..." +# 1. List all .md files in the root folder +echo "1. Listing all .md files in the root folder..." QUERY1='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { readme { name } licenseInfo { key name } } }" + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { object(expression: \"HEAD:\") { ... on Tree { entries { name } } } } }" }' echo " Query: $QUERY1" -README_LICENSE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ +ROOT_FILES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ -d "$QUERY1" https://api.github.com/graphql) -echo " Raw response: $README_LICENSE_RESPONSE" - -# Parse the response -HAS_README=$(echo "$README_LICENSE_RESPONSE" | grep -q '"readme":{' && echo "true" || echo "false") -HAS_LICENSE=$(echo "$README_LICENSE_RESPONSE" | grep -q '"licenseInfo":{' && echo "true" || echo "false") -echo " Has README: $HAS_README" -echo " Has License: $HAS_LICENSE" - -if [ "$HAS_README" = "true" ] && [ "$HAS_LICENSE" = "true" ]; then - echo " Result: README and OSI-approved license: PASS" -else - echo " Result: README and OSI-approved license: FAIL" -fi +echo " Raw response: $ROOT_FILES_RESPONSE" + +# Extract .md files +echo " .md files in root:" +echo "$ROOT_FILES_RESPONSE" | grep -o '"name":"[^"]*"' | cut -d'"' -f4 | grep '\.md$' | while read -r file; do + echo " - $file" +done echo -# 2. Check for at least one release in the last year -echo "2. Checking for releases in the last year..." +# 2. Get license name +echo "2. Getting license name..." 
QUERY2='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { releases(last: 1, orderBy: {field: CREATED_AT, direction: DESC}) { edges { node { publishedAt } } } } }" + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { licenseInfo { name } } }" }' echo " Query: $QUERY2" -RELEASE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ +LICENSE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ -d "$QUERY2" https://api.github.com/graphql) -echo " Raw response: $RELEASE_RESPONSE" - -# Check if the most recent release is within the last year -LATEST_RELEASE_DATE=$(echo "$RELEASE_RESPONSE" | grep -o '"publishedAt":"[^"]*"' | cut -d'"' -f4) -echo " Latest release date: $LATEST_RELEASE_DATE" -if [ -n "$LATEST_RELEASE_DATE" ]; then - RELEASE_EPOCH=$(date -d "$LATEST_RELEASE_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$LATEST_RELEASE_DATE" +%s) - YEAR_AGO_EPOCH=$(date -d "$SINCE_DATE" +%s 2>/dev/null || date -j -f "%Y-%m-%dT%H:%M:%SZ" "$SINCE_DATE" +%s) - echo " Release epoch: $RELEASE_EPOCH" - echo " Year ago epoch: $YEAR_AGO_EPOCH" - if [ "$RELEASE_EPOCH" -gt "$YEAR_AGO_EPOCH" ]; then - echo " Result: Release in last year: yes" - else - echo " Result: Release in last year: no" - fi -else - echo " Result: Release in last year: no" -fi +echo " Raw response: $LICENSE_RESPONSE" + +# Extract license name +LICENSE_NAME=$(echo "$LICENSE_RESPONSE" | grep -o '"name":"[^"]*"' | cut -d'"' -f4) +echo " License name: ${LICENSE_NAME:-None}" echo -# 3. Count active contributors (last 12 months) -echo "3. Counting active contributors in the last 12 months..." +# 3. List all releases with timestamps +echo "3. Listing all releases with timestamps..." QUERY3='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... 
on Commit { history(since: \"'"$SINCE_DATE"'\") { nodes { author { user { login } } } } } } } } }" + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { releases(last: 100, orderBy: {field: CREATED_AT, direction: DESC}) { edges { node { name publishedAt } } } } }" }' echo " Query: $QUERY3" -CONTRIBUTORS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ +RELEASES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ -d "$QUERY3" https://api.github.com/graphql) -echo " Raw response: $CONTRIBUTORS_RESPONSE" - -# Extract and count unique logins -UNIQUE_CONTRIBUTORS=$(echo "$CONTRIBUTORS_RESPONSE" | grep -o '"login":"[^"]*"' | sort | uniq | wc -l) -echo " Unique contributors count: $UNIQUE_CONTRIBUTORS" -if [ "$UNIQUE_CONTRIBUTORS" -gt 3 ]; then - echo " Result: Number of active contributors: above 3 ($UNIQUE_CONTRIBUTORS)" -else - echo " Result: Number of active contributors: below 3 ($UNIQUE_CONTRIBUTORS)" -fi +echo " Raw response: $RELEASES_RESPONSE" + +# Extract releases +echo " Releases:" +echo "$RELEASES_RESPONSE" | grep -E '"name":|"publishedAt":' | while read -r line1 && read -r line2; do + name=$(echo "$line1" | grep -o '"name":"[^"]*"' | cut -d'"' -f4) + date=$(echo "$line2" | grep -o '"publishedAt":"[^"]*"' | cut -d'"' -f4) + echo " - $name: $date" +done echo -# 4. Count commits per month (last 12 months) -echo "4. Calculating average commits per month..." +# 4. List all contributors with their most recent contribution date +echo "4. Listing all contributors with their most recent contribution date..." QUERY4='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... on Commit { history(since: \"'"$SINCE_DATE"'\") { totalCount } } } } } }" + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... 
on Commit { history(first: 100) { nodes { author { user { login } } committedDate } } } } } } }" }' echo " Query: $QUERY4" -COMMITS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ +CONTRIBUTORS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ -d "$QUERY4" https://api.github.com/graphql) -echo " Raw response: $COMMITS_RESPONSE" +echo " Raw response: $CONTRIBUTORS_RESPONSE" -TOTAL_COMMITS=$(echo "$COMMITS_RESPONSE" | grep -o '"totalCount":[0-9]*' | cut -d':' -f2) -echo " Total commits: $TOTAL_COMMITS" -AVG_COMMITS_PER_MONTH=$((TOTAL_COMMITS / 12)) -echo " Average commits per month: $AVG_COMMITS_PER_MONTH" -if [ "$AVG_COMMITS_PER_MONTH" -gt 2 ]; then - echo " Result: Average commits per month: above 2 ($AVG_COMMITS_PER_MONTH)" -else - echo " Result: Average commits per month: below 2 ($AVG_COMMITS_PER_MONTH)" -fi +# Extract contributors and their latest commit dates +echo " Contributors and their most recent contribution:" +# This is a simplified approach - in practice, you'd want to process this more carefully +echo "$CONTRIBUTORS_RESPONSE" | grep -E '"login":|"committedDate":' | while read -r line1 && read -r line2; do + login=$(echo "$line1" | grep -o '"login":"[^"]*"' | cut -d'"' -f4) + date=$(echo "$line2" | grep -o '"committedDate":"[^"]*"' | cut -d'"' -f4) + if [ -n "$login" ] && [ -n "$date" ]; then + echo " - $login: $date" + fi +done echo -# 5. Count open issues -echo "5. Counting open issues..." +# 5. List all commits +echo "5. Listing all commits..." QUERY5='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { issues(states: OPEN) { totalCount } } }" + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... 
on Commit { history(first: 100) { nodes { messageHeadline committedDate author { name } } } } } } } }" }' echo " Query: $QUERY5" -ISSUES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ +COMMITS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ -H "Content-Type: application/json" \ -d "$QUERY5" https://api.github.com/graphql) +echo " Raw response: $COMMITS_RESPONSE" + +# Extract commits +echo " Commits:" +echo "$COMMITS_RESPONSE" | grep -E '"messageHeadline":|"committedDate":|"name":' | while read -r line1 && read -r line2 && read -r line3; do + message=$(echo "$line1" | grep -o '"messageHeadline":"[^"]*"' | cut -d'"' -f4) + date=$(echo "$line2" | grep -o '"committedDate":"[^"]*"' | cut -d'"' -f4) + author=$(echo "$line3" | grep -o '"name":"[^"]*"' | cut -d'"' -f4) + echo " - $date: $author - $message" +done +echo + +# 6. List all issues with creator and status +echo "6. Listing all issues with creator and status..." +QUERY6='{ + "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { issues(first: 100, states: [OPEN, CLOSED]) { nodes { title state author { login } } } } }" +}' +echo " Query: $QUERY6" +ISSUES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d "$QUERY6" https://api.github.com/graphql) echo " Raw response: $ISSUES_RESPONSE" -OPEN_ISSUES=$(echo "$ISSUES_RESPONSE" | grep -o '"totalCount":[0-9]*' | cut -d':' -f2) -echo " Result: Number of open issues: $OPEN_ISSUES" +# Extract issues +echo " Issues:" +echo "$ISSUES_RESPONSE" | grep -E '"title":|"state":|"login":' | while read -r line1 && read -r line2 && read -r line3; do + title=$(echo "$line1" | grep -o '"title":"[^"]*"' | cut -d'"' -f4) + state=$(echo "$line2" | grep -o '"state":"[^"]*"' | cut -d'"' -f4) + author=$(echo "$line3" | grep -o '"login":"[^"]*"' | cut -d'"' -f4) + echo " - $state: $author - $title" +done From 8717274e2ed8dc6c6489e70953ebd57f97b4305d Mon Sep 17 00:00:00 2001 From: 
"snowpoke (aider)" Date: Mon, 22 Sep 2025 15:17:15 +0200 Subject: [PATCH 05/36] feat: Rewrite metrics check script in Python with GraphQL API integration --- metrics_check.py | 230 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 metrics_check.py diff --git a/metrics_check.py b/metrics_check.py new file mode 100644 index 0000000..4ee0969 --- /dev/null +++ b/metrics_check.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python3 + +import os +import sys +import json +import requests +from datetime import datetime, timedelta + +# Check if required environment variables are set +GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') +OWNER = os.environ.get('OWNER') +REPO_NAME = os.environ.get('REPO_NAME') + +if not all([GITHUB_TOKEN, OWNER, REPO_NAME]): + print("Error: Please set GITHUB_TOKEN, OWNER, and REPO_NAME environment variables.") + sys.exit(1) + +# Headers for GraphQL API +headers = { + 'Authorization': f'bearer {GITHUB_TOKEN}', + 'Content-Type': 'application/json' +} + +# GraphQL endpoint +url = 'https://api.github.com/graphql' + +def run_query(query): + """Run a GraphQL query and return the response""" + response = requests.post(url, headers=headers, json={'query': query}) + if response.status_code == 200: + return response.json() + else: + print(f"Query failed with status code {response.status_code}") + return None + +def main(): + print(f"Checking metrics for repository: {OWNER}/{REPO_NAME}") + print("=" * 50) + + # 1. List all .md files in the root folder + print("1. Listing all .md files in the root folder...") + query1 = f''' + {{ + repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ + object(expression: "HEAD:") {{ + ... 
on Tree {{ + entries {{ + name + }} + }} + }} + }} + }} + ''' + print(f" Query: {query1}") + result1 = run_query(query1) + print(f" Raw response: {json.dumps(result1, indent=2)}") + + if result1: + entries = result1['data']['repository']['object']['entries'] + md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] + print(" .md files in root:") + for file in md_files: + print(f" - {file}") + print() + + # 2. Get license name + print("2. Getting license name...") + query2 = f''' + {{ + repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ + licenseInfo {{ + name + }} + }} + }} + ''' + print(f" Query: {query2}") + result2 = run_query(query2) + print(f" Raw response: {json.dumps(result2, indent=2)}") + + if result2: + license_info = result2['data']['repository']['licenseInfo'] + license_name = license_info['name'] if license_info else 'None' + print(f" License name: {license_name}") + print() + + # 3. List all releases with timestamps + print("3. Listing all releases with timestamps...") + query3 = f''' + {{ + repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ + releases(last: 100, orderBy: {{field: CREATED_AT, direction: DESC}}) {{ + edges {{ + node {{ + name + publishedAt + }} + }} + }} + }} + }} + ''' + print(f" Query: {query3}") + result3 = run_query(query3) + print(f" Raw response: {json.dumps(result3, indent=2)}") + + if result3: + releases = result3['data']['repository']['releases']['edges'] + print(" Releases:") + for release in releases: + node = release['node'] + name = node['name'] or 'Unnamed release' + published_at = node['publishedAt'] + print(f" - {name}: {published_at}") + print() + + # 4. List all contributors with their most recent contribution date + print("4. Listing all contributors with their most recent contribution date...") + query4 = f''' + {{ + repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ + defaultBranchRef {{ + target {{ + ... 
on Commit {{ + history(first: 100) {{ + nodes {{ + author {{ + user {{ + login + }} + }} + committedDate + }} + }} + }} + }} + }} + }} + }} + ''' + print(f" Query: {query4}") + result4 = run_query(query4) + print(f" Raw response: {json.dumps(result4, indent=2)}") + + if result4: + commits = result4['data']['repository']['defaultBranchRef']['target']['history']['nodes'] + contributors = {} + for commit in commits: + if commit['author'] and commit['author']['user']: + login = commit['author']['user']['login'] + date = commit['committedDate'] + # Track the latest date for each contributor + if login not in contributors or date > contributors[login]: + contributors[login] = date + + print(" Contributors and their most recent contribution:") + for login, date in contributors.items(): + print(f" - {login}: {date}") + print() + + # 5. List all commits + print("5. Listing all commits...") + query5 = f''' + {{ + repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ + defaultBranchRef {{ + target {{ + ... on Commit {{ + history(first: 100) {{ + nodes {{ + messageHeadline + committedDate + author {{ + name + }} + }} + }} + }} + }} + }} + }} + }} + ''' + print(f" Query: {query5}") + result5 = run_query(query5) + print(f" Raw response: {json.dumps(result5, indent=2)}") + + if result5: + commits = result5['data']['repository']['defaultBranchRef']['target']['history']['nodes'] + print(" Commits:") + for commit in commits: + message = commit['messageHeadline'] + date = commit['committedDate'] + author_name = commit['author']['name'] if commit['author'] else 'Unknown' + print(f" - {date}: {author_name} - {message}") + print() + + # 6. List all issues with creator and status + print("6. 
Listing all issues with creator and status...") + query6 = f''' + {{ + repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ + issues(first: 100, states: [OPEN, CLOSED]) {{ + nodes {{ + title + state + author {{ + login + }} + }} + }} + }} + }} + ''' + print(f" Query: {query6}") + result6 = run_query(query6) + print(f" Raw response: {json.dumps(result6, indent=2)}") + + if result6: + issues = result6['data']['repository']['issues']['nodes'] + print(" Issues:") + for issue in issues: + title = issue['title'] + state = issue['state'] + author = issue['author']['login'] if issue['author'] else 'Unknown' + print(f" - {state}: {author} - {title}") + +if __name__ == '__main__': + main() From 60e46dd49d163ddaace7bed3a4519d8f252321a5 Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 15:35:05 +0200 Subject: [PATCH 06/36] feat: add metrics calls --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aa54bbb --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +# local helpers and temp files +.local +.workspace +.aider* From 2f1dc4f9d01fd478f998d10d9bd09bd7e921956d Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:26:05 +0200 Subject: [PATCH 07/36] refactor: extract graphql queries and modularize metrics check functions --- metrics_check.py | 325 ++++++++++++++++++++++++++--------------------- 1 file changed, 178 insertions(+), 147 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index 4ee0969..c99fbe2 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -5,6 +5,7 @@ import json import requests from datetime import datetime, timedelta +from typing import Dict, List, Optional, Any # Check if required environment variables are set GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') @@ -24,207 +25,237 @@ # GraphQL endpoint url = 'https://api.github.com/graphql' -def run_query(query): +# GraphQL Queries +ROOT_FILES_QUERY = ''' +{ + 
repository(owner: "%s", name: "%s") { + object(expression: "HEAD:") { + ... on Tree { + entries { + name + } + } + } + } +} +''' + +LICENSE_QUERY = ''' +{ + repository(owner: "%s", name: "%s") { + licenseInfo { + name + } + } +} +''' + +RELEASES_QUERY = ''' +{ + repository(owner: "%s", name: "%s") { + releases(last: 100, orderBy: {field: CREATED_AT, direction: DESC}) { + edges { + node { + name + publishedAt + } + } + } + } +} +''' + +CONTRIBUTORS_QUERY = ''' +{ + repository(owner: "%s", name: "%s") { + defaultBranchRef { + target { + ... on Commit { + history(first: 100) { + nodes { + author { + user { + login + } + } + committedDate + } + } + } + } + } + } +} +''' + +COMMITS_QUERY = ''' +{ + repository(owner: "%s", name: "%s") { + defaultBranchRef { + target { + ... on Commit { + history(first: 100) { + nodes { + messageHeadline + committedDate + author { + name + } + } + } + } + } + } + } +} +''' + +ISSUES_QUERY = ''' +{ + repository(owner: "%s", name: "%s") { + issues(first: 100, states: [OPEN, CLOSED]) { + nodes { + title + state + author { + login + } + } + } + } +} +''' + +def run_query(query: str) -> Optional[Dict[Any, Any]]: """Run a GraphQL query and return the response""" - response = requests.post(url, headers=headers, json={'query': query}) - if response.status_code == 200: - return response.json() - else: - print(f"Query failed with status code {response.status_code}") + try: + response = requests.post(url, headers=headers, json={'query': query}) + if response.status_code == 200: + return response.json() + else: + print(f"Query failed with status code {response.status_code}") + return None + except Exception as e: + print(f"Query failed with exception: {e}") return None -def main(): - print(f"Checking metrics for repository: {OWNER}/{REPO_NAME}") - print("=" * 50) - - # 1. List all .md files in the root folder +def check_root_md_files(owner: str, repo_name: str) -> None: + """Check and display all .md files in the root folder""" print("1. 
Listing all .md files in the root folder...") - query1 = f''' - {{ - repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ - object(expression: "HEAD:") {{ - ... on Tree {{ - entries {{ - name - }} - }} - }} - }} - }} - ''' - print(f" Query: {query1}") - result1 = run_query(query1) - print(f" Raw response: {json.dumps(result1, indent=2)}") + query = ROOT_FILES_QUERY % (owner, repo_name) + result = run_query(query) - if result1: - entries = result1['data']['repository']['object']['entries'] + if result and 'data' in result and result['data']['repository']['object']: + entries = result['data']['repository']['object']['entries'] md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] print(" .md files in root:") for file in md_files: print(f" - {file}") + else: + print(" No .md files found or error occurred") print() - - # 2. Get license name + +def check_license(owner: str, repo_name: str) -> None: + """Check and display the repository license name""" print("2. Getting license name...") - query2 = f''' - {{ - repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ - licenseInfo {{ - name - }} - }} - }} - ''' - print(f" Query: {query2}") - result2 = run_query(query2) - print(f" Raw response: {json.dumps(result2, indent=2)}") + query = LICENSE_QUERY % (owner, repo_name) + result = run_query(query) - if result2: - license_info = result2['data']['repository']['licenseInfo'] + if result and 'data' in result: + license_info = result['data']['repository']['licenseInfo'] license_name = license_info['name'] if license_info else 'None' print(f" License name: {license_name}") + else: + print(" Error retrieving license information") print() - - # 3. List all releases with timestamps + +def check_releases(owner: str, repo_name: str) -> None: + """Check and display all releases with timestamps""" print("3. 
Listing all releases with timestamps...") - query3 = f''' - {{ - repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ - releases(last: 100, orderBy: {{field: CREATED_AT, direction: DESC}}) {{ - edges {{ - node {{ - name - publishedAt - }} - }} - }} - }} - }} - ''' - print(f" Query: {query3}") - result3 = run_query(query3) - print(f" Raw response: {json.dumps(result3, indent=2)}") + query = RELEASES_QUERY % (owner, repo_name) + result = run_query(query) - if result3: - releases = result3['data']['repository']['releases']['edges'] + if result and 'data' in result and result['data']['repository']['releases']: + releases = result['data']['repository']['releases']['edges'] print(" Releases:") for release in releases: node = release['node'] name = node['name'] or 'Unnamed release' published_at = node['publishedAt'] print(f" - {name}: {published_at}") + else: + print(" No releases found or error occurred") print() - - # 4. List all contributors with their most recent contribution date + +def check_contributors(owner: str, repo_name: str) -> None: + """Check and display all contributors with their most recent contribution date""" print("4. Listing all contributors with their most recent contribution date...") - query4 = f''' - {{ - repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ - defaultBranchRef {{ - target {{ - ... 
on Commit {{ - history(first: 100) {{ - nodes {{ - author {{ - user {{ - login - }} - }} - committedDate - }} - }} - }} - }} - }} - }} - }} - ''' - print(f" Query: {query4}") - result4 = run_query(query4) - print(f" Raw response: {json.dumps(result4, indent=2)}") + query = CONTRIBUTORS_QUERY % (owner, repo_name) + result = run_query(query) - if result4: - commits = result4['data']['repository']['defaultBranchRef']['target']['history']['nodes'] - contributors = {} + if result and 'data' in result and result['data']['repository']['defaultBranchRef']: + commits = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] + contributors: Dict[str, str] = {} for commit in commits: if commit['author'] and commit['author']['user']: login = commit['author']['user']['login'] date = commit['committedDate'] - # Track the latest date for each contributor if login not in contributors or date > contributors[login]: contributors[login] = date print(" Contributors and their most recent contribution:") for login, date in contributors.items(): print(f" - {login}: {date}") + else: + print(" No contributors found or error occurred") print() - - # 5. List all commits + +def check_commits(owner: str, repo_name: str) -> None: + """Check and display all commits""" print("5. Listing all commits...") - query5 = f''' - {{ - repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ - defaultBranchRef {{ - target {{ - ... 
on Commit {{ - history(first: 100) {{ - nodes {{ - messageHeadline - committedDate - author {{ - name - }} - }} - }} - }} - }} - }} - }} - }} - ''' - print(f" Query: {query5}") - result5 = run_query(query5) - print(f" Raw response: {json.dumps(result5, indent=2)}") + query = COMMITS_QUERY % (owner, repo_name) + result = run_query(query) - if result5: - commits = result5['data']['repository']['defaultBranchRef']['target']['history']['nodes'] + if result and 'data' in result and result['data']['repository']['defaultBranchRef']: + commits = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] print(" Commits:") for commit in commits: message = commit['messageHeadline'] date = commit['committedDate'] author_name = commit['author']['name'] if commit['author'] else 'Unknown' print(f" - {date}: {author_name} - {message}") + else: + print(" No commits found or error occurred") print() - - # 6. List all issues with creator and status + +def check_issues(owner: str, repo_name: str) -> None: + """Check and display all issues with creator and status""" print("6. 
Listing all issues with creator and status...") - query6 = f''' - {{ - repository(owner: "{OWNER}", name: "{REPO_NAME}") {{ - issues(first: 100, states: [OPEN, CLOSED]) {{ - nodes {{ - title - state - author {{ - login - }} - }} - }} - }} - }} - ''' - print(f" Query: {query6}") - result6 = run_query(query6) - print(f" Raw response: {json.dumps(result6, indent=2)}") + query = ISSUES_QUERY % (owner, repo_name) + result = run_query(query) - if result6: - issues = result6['data']['repository']['issues']['nodes'] + if result and 'data' in result and result['data']['repository']['issues']: + issues = result['data']['repository']['issues']['nodes'] print(" Issues:") for issue in issues: title = issue['title'] state = issue['state'] author = issue['author']['login'] if issue['author'] else 'Unknown' print(f" - {state}: {author} - {title}") + else: + print(" No issues found or error occurred") + print() + +def main() -> None: + print(f"Checking metrics for repository: {OWNER}/{REPO_NAME}") + print("=" * 50) + + check_root_md_files(OWNER, REPO_NAME) + check_license(OWNER, REPO_NAME) + check_releases(OWNER, REPO_NAME) + check_contributors(OWNER, REPO_NAME) + check_commits(OWNER, REPO_NAME) + check_issues(OWNER, REPO_NAME) if __name__ == '__main__': main() From 632fd14a4b9e10969dd0fde1f361325aff2ac2a4 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:30:37 +0200 Subject: [PATCH 08/36] refactor: separate data retrieval and presentation logic in metrics_check.py --- metrics_check.py | 139 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 98 insertions(+), 41 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index c99fbe2..8dcdf6d 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -139,63 +139,111 @@ def run_query(query: str) -> Optional[Dict[Any, Any]]: print(f"Query failed with exception: {e}") return None -def check_root_md_files(owner: str, repo_name: str) -> None: - """Check and display all .md files in the root 
folder""" +def print_root_md_files(md_files: List[str]) -> None: + """Print the .md files in the root folder""" + print(" .md files in root:") + for file in md_files: + print(f" - {file}") + +def print_license(license_name: str) -> None: + """Print the repository license name""" + print(f" License name: {license_name}") + +def print_releases(releases: List[Dict[str, str]]) -> None: + """Print all releases with timestamps""" + print(" Releases:") + for release in releases: + name = release['name'] + published_at = release['publishedAt'] + print(f" - {name}: {published_at}") + +def print_contributors(contributors: Dict[str, str]) -> None: + """Print all contributors with their most recent contribution date""" + print(" Contributors and their most recent contribution:") + for login, date in contributors.items(): + print(f" - {login}: {date}") + +def print_commits(commits: List[Dict[str, str]]) -> None: + """Print all commits""" + print(" Commits:") + for commit in commits: + message = commit['message'] + date = commit['date'] + author_name = commit['author'] + print(f" - {date}: {author_name} - {message}") + +def print_issues(issues: List[Dict[str, str]]) -> None: + """Print all issues with creator and status""" + print(" Issues:") + for issue in issues: + title = issue['title'] + state = issue['state'] + author = issue['author'] + print(f" - {state}: {author} - {title}") + +def check_root_md_files(owner: str, repo_name: str) -> List[str]: + """Check and return all .md files in the root folder""" print("1. 
Listing all .md files in the root folder...") query = ROOT_FILES_QUERY % (owner, repo_name) result = run_query(query) + md_files = [] if result and 'data' in result and result['data']['repository']['object']: entries = result['data']['repository']['object']['entries'] md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] - print(" .md files in root:") - for file in md_files: - print(f" - {file}") + print_root_md_files(md_files) else: print(" No .md files found or error occurred") print() + return md_files -def check_license(owner: str, repo_name: str) -> None: - """Check and display the repository license name""" +def check_license(owner: str, repo_name: str) -> str: + """Check and return the repository license name""" print("2. Getting license name...") query = LICENSE_QUERY % (owner, repo_name) result = run_query(query) + license_name = 'None' if result and 'data' in result: license_info = result['data']['repository']['licenseInfo'] license_name = license_info['name'] if license_info else 'None' - print(f" License name: {license_name}") + print_license(license_name) else: print(" Error retrieving license information") print() + return license_name -def check_releases(owner: str, repo_name: str) -> None: - """Check and display all releases with timestamps""" +def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: + """Check and return all releases with timestamps""" print("3. 
Listing all releases with timestamps...") query = RELEASES_QUERY % (owner, repo_name) result = run_query(query) + releases = [] if result and 'data' in result and result['data']['repository']['releases']: - releases = result['data']['repository']['releases']['edges'] - print(" Releases:") - for release in releases: - node = release['node'] - name = node['name'] or 'Unnamed release' - published_at = node['publishedAt'] - print(f" - {name}: {published_at}") + release_edges = result['data']['repository']['releases']['edges'] + releases = [ + { + 'name': edge['node']['name'] or 'Unnamed release', + 'publishedAt': edge['node']['publishedAt'] + } + for edge in release_edges + ] + print_releases(releases) else: print(" No releases found or error occurred") print() + return releases -def check_contributors(owner: str, repo_name: str) -> None: - """Check and display all contributors with their most recent contribution date""" +def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: + """Check and return all contributors with their most recent contribution date""" print("4. 
Listing all contributors with their most recent contribution date...") query = CONTRIBUTORS_QUERY % (owner, repo_name) result = run_query(query) + contributors: Dict[str, str] = {} if result and 'data' in result and result['data']['repository']['defaultBranchRef']: commits = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] - contributors: Dict[str, str] = {} for commit in commits: if commit['author'] and commit['author']['user']: login = commit['author']['user']['login'] @@ -203,48 +251,57 @@ def check_contributors(owner: str, repo_name: str) -> None: if login not in contributors or date > contributors[login]: contributors[login] = date - print(" Contributors and their most recent contribution:") - for login, date in contributors.items(): - print(f" - {login}: {date}") + print_contributors(contributors) else: print(" No contributors found or error occurred") print() + return contributors -def check_commits(owner: str, repo_name: str) -> None: - """Check and display all commits""" +def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: + """Check and return all commits""" print("5. 
Listing all commits...") query = COMMITS_QUERY % (owner, repo_name) result = run_query(query) + commits = [] if result and 'data' in result and result['data']['repository']['defaultBranchRef']: - commits = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] - print(" Commits:") - for commit in commits: - message = commit['messageHeadline'] - date = commit['committedDate'] - author_name = commit['author']['name'] if commit['author'] else 'Unknown' - print(f" - {date}: {author_name} - {message}") + commit_nodes = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] + commits = [ + { + 'message': commit['messageHeadline'], + 'date': commit['committedDate'], + 'author': commit['author']['name'] if commit['author'] else 'Unknown' + } + for commit in commit_nodes + ] + print_commits(commits) else: print(" No commits found or error occurred") print() + return commits -def check_issues(owner: str, repo_name: str) -> None: - """Check and display all issues with creator and status""" +def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: + """Check and return all issues with creator and status""" print("6. 
Listing all issues with creator and status...") query = ISSUES_QUERY % (owner, repo_name) result = run_query(query) + issues = [] if result and 'data' in result and result['data']['repository']['issues']: - issues = result['data']['repository']['issues']['nodes'] - print(" Issues:") - for issue in issues: - title = issue['title'] - state = issue['state'] - author = issue['author']['login'] if issue['author'] else 'Unknown' - print(f" - {state}: {author} - {title}") + issue_nodes = result['data']['repository']['issues']['nodes'] + issues = [ + { + 'title': issue['title'], + 'state': issue['state'], + 'author': issue['author']['login'] if issue['author'] else 'Unknown' + } + for issue in issue_nodes + ] + print_issues(issues) else: print(" No issues found or error occurred") print() + return issues def main() -> None: print(f"Checking metrics for repository: {OWNER}/{REPO_NAME}") From 50da3bb098ba582883493965869c01ee9390c6a7 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:35:51 +0200 Subject: [PATCH 09/36] fix: convert output to newline-delimited JSON format --- metrics_check.py | 82 +++++++++++------------------------------------- 1 file changed, 18 insertions(+), 64 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index 8dcdf6d..c872fef 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -140,50 +140,31 @@ def run_query(query: str) -> Optional[Dict[Any, Any]]: return None def print_root_md_files(md_files: List[str]) -> None: - """Print the .md files in the root folder""" - print(" .md files in root:") - for file in md_files: - print(f" - {file}") + """Print the .md files in the root folder as JSON""" + print(json.dumps({"metric": "root_md_files", "data": md_files})) def print_license(license_name: str) -> None: - """Print the repository license name""" - print(f" License name: {license_name}") + """Print the repository license name as JSON""" + print(json.dumps({"metric": "license", "data": license_name})) def 
print_releases(releases: List[Dict[str, str]]) -> None: - """Print all releases with timestamps""" - print(" Releases:") - for release in releases: - name = release['name'] - published_at = release['publishedAt'] - print(f" - {name}: {published_at}") + """Print all releases with timestamps as JSON""" + print(json.dumps({"metric": "releases", "data": releases})) def print_contributors(contributors: Dict[str, str]) -> None: - """Print all contributors with their most recent contribution date""" - print(" Contributors and their most recent contribution:") - for login, date in contributors.items(): - print(f" - {login}: {date}") + """Print all contributors with their most recent contribution date as JSON""" + print(json.dumps({"metric": "contributors", "data": contributors})) def print_commits(commits: List[Dict[str, str]]) -> None: - """Print all commits""" - print(" Commits:") - for commit in commits: - message = commit['message'] - date = commit['date'] - author_name = commit['author'] - print(f" - {date}: {author_name} - {message}") + """Print all commits as JSON""" + print(json.dumps({"metric": "commits", "data": commits})) def print_issues(issues: List[Dict[str, str]]) -> None: - """Print all issues with creator and status""" - print(" Issues:") - for issue in issues: - title = issue['title'] - state = issue['state'] - author = issue['author'] - print(f" - {state}: {author} - {title}") + """Print all issues with creator and status as JSON""" + print(json.dumps({"metric": "issues", "data": issues})) def check_root_md_files(owner: str, repo_name: str) -> List[str]: """Check and return all .md files in the root folder""" - print("1. 
Listing all .md files in the root folder...") query = ROOT_FILES_QUERY % (owner, repo_name) result = run_query(query) @@ -191,15 +172,11 @@ def check_root_md_files(owner: str, repo_name: str) -> List[str]: if result and 'data' in result and result['data']['repository']['object']: entries = result['data']['repository']['object']['entries'] md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] - print_root_md_files(md_files) - else: - print(" No .md files found or error occurred") - print() + print_root_md_files(md_files) return md_files def check_license(owner: str, repo_name: str) -> str: """Check and return the repository license name""" - print("2. Getting license name...") query = LICENSE_QUERY % (owner, repo_name) result = run_query(query) @@ -207,15 +184,11 @@ def check_license(owner: str, repo_name: str) -> str: if result and 'data' in result: license_info = result['data']['repository']['licenseInfo'] license_name = license_info['name'] if license_info else 'None' - print_license(license_name) - else: - print(" Error retrieving license information") - print() + print_license(license_name) return license_name def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all releases with timestamps""" - print("3. Listing all releases with timestamps...") query = RELEASES_QUERY % (owner, repo_name) result = run_query(query) @@ -229,15 +202,11 @@ def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: } for edge in release_edges ] - print_releases(releases) - else: - print(" No releases found or error occurred") - print() + print_releases(releases) return releases def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: """Check and return all contributors with their most recent contribution date""" - print("4. 
Listing all contributors with their most recent contribution date...") query = CONTRIBUTORS_QUERY % (owner, repo_name) result = run_query(query) @@ -250,16 +219,11 @@ def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: date = commit['committedDate'] if login not in contributors or date > contributors[login]: contributors[login] = date - - print_contributors(contributors) - else: - print(" No contributors found or error occurred") - print() + print_contributors(contributors) return contributors def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all commits""" - print("5. Listing all commits...") query = COMMITS_QUERY % (owner, repo_name) result = run_query(query) @@ -274,15 +238,11 @@ def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: } for commit in commit_nodes ] - print_commits(commits) - else: - print(" No commits found or error occurred") - print() + print_commits(commits) return commits def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all issues with creator and status""" - print("6. 
Listing all issues with creator and status...") query = ISSUES_QUERY % (owner, repo_name) result = run_query(query) @@ -297,16 +257,10 @@ def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: } for issue in issue_nodes ] - print_issues(issues) - else: - print(" No issues found or error occurred") - print() + print_issues(issues) return issues def main() -> None: - print(f"Checking metrics for repository: {OWNER}/{REPO_NAME}") - print("=" * 50) - check_root_md_files(OWNER, REPO_NAME) check_license(OWNER, REPO_NAME) check_releases(OWNER, REPO_NAME) From c0eb9f0bfa0e3a41b1747e165fc6fd4e8ee415b3 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:43:33 +0200 Subject: [PATCH 10/36] refactor: convert metrics output to individual JSONL files per metric --- metrics_check.py | 59 ++++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index c872fef..2812ed4 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -139,29 +139,40 @@ def run_query(query: str) -> Optional[Dict[Any, Any]]: print(f"Query failed with exception: {e}") return None -def print_root_md_files(md_files: List[str]) -> None: - """Print the .md files in the root folder as JSON""" - print(json.dumps({"metric": "root_md_files", "data": md_files})) +def write_root_md_files(md_files: List[str]) -> None: + """Write the .md files in the root folder as JSONL""" + with open('root_md_files.jsonl', 'w') as f: + for file in md_files: + f.write(json.dumps({"file": file}) + '\n') -def print_license(license_name: str) -> None: - """Print the repository license name as JSON""" - print(json.dumps({"metric": "license", "data": license_name})) +def write_license(license_name: str) -> None: + """Write the repository license name as JSONL""" + with open('license.jsonl', 'w') as f: + f.write(json.dumps({"license": license_name}) + '\n') -def print_releases(releases: List[Dict[str, str]]) -> None: - 
"""Print all releases with timestamps as JSON""" - print(json.dumps({"metric": "releases", "data": releases})) +def write_releases(releases: List[Dict[str, str]]) -> None: + """Write all releases with timestamps as JSONL""" + with open('releases.jsonl', 'w') as f: + for release in releases: + f.write(json.dumps(release) + '\n') -def print_contributors(contributors: Dict[str, str]) -> None: - """Print all contributors with their most recent contribution date as JSON""" - print(json.dumps({"metric": "contributors", "data": contributors})) +def write_contributors(contributors: Dict[str, str]) -> None: + """Write all contributors with their most recent contribution date as JSONL""" + with open('contributors.jsonl', 'w') as f: + for login, date in contributors.items(): + f.write(json.dumps({"login": login, "last_contribution": date}) + '\n') -def print_commits(commits: List[Dict[str, str]]) -> None: - """Print all commits as JSON""" - print(json.dumps({"metric": "commits", "data": commits})) +def write_commits(commits: List[Dict[str, str]]) -> None: + """Write all commits as JSONL""" + with open('commits.jsonl', 'w') as f: + for commit in commits: + f.write(json.dumps(commit) + '\n') -def print_issues(issues: List[Dict[str, str]]) -> None: - """Print all issues with creator and status as JSON""" - print(json.dumps({"metric": "issues", "data": issues})) +def write_issues(issues: List[Dict[str, str]]) -> None: + """Write all issues with creator and status as JSONL""" + with open('issues.jsonl', 'w') as f: + for issue in issues: + f.write(json.dumps(issue) + '\n') def check_root_md_files(owner: str, repo_name: str) -> List[str]: """Check and return all .md files in the root folder""" @@ -172,7 +183,7 @@ def check_root_md_files(owner: str, repo_name: str) -> List[str]: if result and 'data' in result and result['data']['repository']['object']: entries = result['data']['repository']['object']['entries'] md_files = [entry['name'] for entry in entries if 
entry['name'].endswith('.md')] - print_root_md_files(md_files) + write_root_md_files(md_files) return md_files def check_license(owner: str, repo_name: str) -> str: @@ -184,7 +195,7 @@ def check_license(owner: str, repo_name: str) -> str: if result and 'data' in result: license_info = result['data']['repository']['licenseInfo'] license_name = license_info['name'] if license_info else 'None' - print_license(license_name) + write_license(license_name) return license_name def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: @@ -202,7 +213,7 @@ def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: } for edge in release_edges ] - print_releases(releases) + write_releases(releases) return releases def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: @@ -219,7 +230,7 @@ def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: date = commit['committedDate'] if login not in contributors or date > contributors[login]: contributors[login] = date - print_contributors(contributors) + write_contributors(contributors) return contributors def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: @@ -238,7 +249,7 @@ def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: } for commit in commit_nodes ] - print_commits(commits) + write_commits(commits) return commits def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: @@ -257,7 +268,7 @@ def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: } for issue in issue_nodes ] - print_issues(issues) + write_issues(issues) return issues def main() -> None: From a8f54486a67167796592d71c1406ca5434b426a3 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:45:59 +0200 Subject: [PATCH 11/36] refactor: Implement pagination and one-year date filtering for all GitHub metrics queries --- metrics_check.py | 230 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 165 insertions(+), 65 
deletions(-) diff --git a/metrics_check.py b/metrics_check.py index 2812ed4..c019b60 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -25,7 +25,10 @@ # GraphQL endpoint url = 'https://api.github.com/graphql' -# GraphQL Queries +# Calculate the date for one year ago +one_year_ago = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%dT%H:%M:%SZ') + +# GraphQL Queries with pagination and date filtering ROOT_FILES_QUERY = ''' { repository(owner: "%s", name: "%s") { @@ -51,14 +54,19 @@ ''' RELEASES_QUERY = ''' -{ +query($cursor: String) { repository(owner: "%s", name: "%s") { - releases(last: 100, orderBy: {field: CREATED_AT, direction: DESC}) { + releases(first: 100, orderBy: {field: CREATED_AT, direction: DESC}, after: $cursor) { edges { node { name publishedAt } + cursor + } + pageInfo { + hasNextPage + endCursor } } } @@ -66,12 +74,12 @@ ''' CONTRIBUTORS_QUERY = ''' -{ +query($cursor: String, $since: GitTimestamp!) { repository(owner: "%s", name: "%s") { defaultBranchRef { target { ... on Commit { - history(first: 100) { + history(first: 100, since: $since, after: $cursor) { nodes { author { user { @@ -80,6 +88,10 @@ } committedDate } + pageInfo { + hasNextPage + endCursor + } } } } @@ -89,12 +101,12 @@ ''' COMMITS_QUERY = ''' -{ +query($cursor: String, $since: GitTimestamp!) { repository(owner: "%s", name: "%s") { defaultBranchRef { target { ... 
on Commit { - history(first: 100) { + history(first: 100, since: $since, after: $cursor) { nodes { messageHeadline committedDate @@ -102,6 +114,10 @@ name } } + pageInfo { + hasNextPage + endCursor + } } } } @@ -111,25 +127,30 @@ ''' ISSUES_QUERY = ''' -{ +query($cursor: String) { repository(owner: "%s", name: "%s") { - issues(first: 100, states: [OPEN, CLOSED]) { + issues(first: 100, states: [OPEN, CLOSED], after: $cursor, orderBy: {field: CREATED_AT, direction: DESC}) { nodes { title state author { login } + createdAt + } + pageInfo { + hasNextPage + endCursor } } } } ''' -def run_query(query: str) -> Optional[Dict[Any, Any]]: - """Run a GraphQL query and return the response""" +def run_query(query: str, variables: Dict = {}) -> Optional[Dict[Any, Any]]: + """Run a GraphQL query with variables and return the response""" try: - response = requests.post(url, headers=headers, json={'query': query}) + response = requests.post(url, headers=headers, json={'query': query, 'variables': variables}) if response.status_code == 200: return response.json() else: @@ -199,75 +220,154 @@ def check_license(owner: str, repo_name: str) -> str: return license_name def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: - """Check and return all releases with timestamps""" - query = RELEASES_QUERY % (owner, repo_name) - result = run_query(query) - + """Check and return all releases with timestamps from the past year""" releases = [] - if result and 'data' in result and result['data']['repository']['releases']: - release_edges = result['data']['repository']['releases']['edges'] - releases = [ - { - 'name': edge['node']['name'] or 'Unnamed release', - 'publishedAt': edge['node']['publishedAt'] - } - for edge in release_edges - ] + has_next_page = True + cursor = None + + while has_next_page: + query = RELEASES_QUERY % (owner, repo_name) + variables = {} + if cursor: + variables['cursor'] = cursor + + result = run_query(query, variables) + + if result and 'data' in 
result and result['data']['repository']['releases']: + release_edges = result['data']['repository']['releases']['edges'] + + # Filter releases from the past year + for edge in release_edges: + published_at = edge['node']['publishedAt'] + if published_at and published_at >= one_year_ago: + releases.append({ + 'name': edge['node']['name'] or 'Unnamed release', + 'publishedAt': published_at + }) + elif published_at and published_at < one_year_ago: + # Stop if we've gone past the one-year boundary + has_next_page = False + break + + # Check pagination info + page_info = result['data']['repository']['releases']['pageInfo'] + has_next_page = page_info['hasNextPage'] and has_next_page + cursor = page_info['endCursor'] + else: + has_next_page = False + write_releases(releases) return releases def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: - """Check and return all contributors with their most recent contribution date""" - query = CONTRIBUTORS_QUERY % (owner, repo_name) - result = run_query(query) - + """Check and return all contributors with their most recent contribution date from the past year""" contributors: Dict[str, str] = {} - if result and 'data' in result and result['data']['repository']['defaultBranchRef']: - commits = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] - for commit in commits: - if commit['author'] and commit['author']['user']: - login = commit['author']['user']['login'] - date = commit['committedDate'] - if login not in contributors or date > contributors[login]: - contributors[login] = date + has_next_page = True + cursor = None + + while has_next_page: + query = CONTRIBUTORS_QUERY % (owner, repo_name) + variables = {'since': one_year_ago} + if cursor: + variables['cursor'] = cursor + + result = run_query(query, variables) + + if result and 'data' in result and result['data']['repository']['defaultBranchRef']: + history = result['data']['repository']['defaultBranchRef']['target']['history'] + 
commits = history['nodes'] + + for commit in commits: + if commit['author'] and commit['author']['user']: + login = commit['author']['user']['login'] + date = commit['committedDate'] + if login not in contributors or date > contributors[login]: + contributors[login] = date + + # Check pagination info + page_info = history['pageInfo'] + has_next_page = page_info['hasNextPage'] + cursor = page_info['endCursor'] + else: + has_next_page = False + write_contributors(contributors) return contributors def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: - """Check and return all commits""" - query = COMMITS_QUERY % (owner, repo_name) - result = run_query(query) - + """Check and return all commits from the past year""" commits = [] - if result and 'data' in result and result['data']['repository']['defaultBranchRef']: - commit_nodes = result['data']['repository']['defaultBranchRef']['target']['history']['nodes'] - commits = [ - { - 'message': commit['messageHeadline'], - 'date': commit['committedDate'], - 'author': commit['author']['name'] if commit['author'] else 'Unknown' - } - for commit in commit_nodes - ] + has_next_page = True + cursor = None + + while has_next_page: + query = COMMITS_QUERY % (owner, repo_name) + variables = {'since': one_year_ago} + if cursor: + variables['cursor'] = cursor + + result = run_query(query, variables) + + if result and 'data' in result and result['data']['repository']['defaultBranchRef']: + history = result['data']['repository']['defaultBranchRef']['target']['history'] + commit_nodes = history['nodes'] + + for commit in commit_nodes: + commits.append({ + 'message': commit['messageHeadline'], + 'date': commit['committedDate'], + 'author': commit['author']['name'] if commit['author'] else 'Unknown' + }) + + # Check pagination info + page_info = history['pageInfo'] + has_next_page = page_info['hasNextPage'] + cursor = page_info['endCursor'] + else: + has_next_page = False + write_commits(commits) return commits def 
check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: - """Check and return all issues with creator and status""" - query = ISSUES_QUERY % (owner, repo_name) - result = run_query(query) - + """Check and return all issues with creator and status from the past year""" issues = [] - if result and 'data' in result and result['data']['repository']['issues']: - issue_nodes = result['data']['repository']['issues']['nodes'] - issues = [ - { - 'title': issue['title'], - 'state': issue['state'], - 'author': issue['author']['login'] if issue['author'] else 'Unknown' - } - for issue in issue_nodes - ] + has_next_page = True + cursor = None + + while has_next_page: + query = ISSUES_QUERY % (owner, repo_name) + variables = {} + if cursor: + variables['cursor'] = cursor + + result = run_query(query, variables) + + if result and 'data' in result and result['data']['repository']['issues']: + issue_nodes = result['data']['repository']['issues']['nodes'] + + # Filter issues from the past year + for issue in issue_nodes: + created_at = issue['createdAt'] + if created_at and created_at >= one_year_ago: + issues.append({ + 'title': issue['title'], + 'state': issue['state'], + 'author': issue['author']['login'] if issue['author'] else 'Unknown', + 'createdAt': created_at + }) + elif created_at and created_at < one_year_ago: + # Stop if we've gone past the one-year boundary + has_next_page = False + break + + # Check pagination info + page_info = result['data']['repository']['issues']['pageInfo'] + has_next_page = page_info['hasNextPage'] and has_next_page + cursor = page_info['endCursor'] + else: + has_next_page = False + write_issues(issues) return issues From eb4bbe4eb88c4762ce91d7642ed808245a73abd7 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:50:01 +0200 Subject: [PATCH 12/36] refactor: extract pagination logic into generic helper function and remove code duplication --- metrics_check.py | 269 +++++++++++++++++++++++++---------------------- 1 
file changed, 142 insertions(+), 127 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index c019b60..6f8263a 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -5,7 +5,7 @@ import json import requests from datetime import datetime, timedelta -from typing import Dict, List, Optional, Any +from typing import Dict, List, Optional, Any, Callable, Tuple # Check if required environment variables are set GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') @@ -160,6 +160,42 @@ def run_query(query: str, variables: Dict = {}) -> Optional[Dict[Any, Any]]: print(f"Query failed with exception: {e}") return None +def paginate_github_query( + query: str, + extract_function: Callable[[Dict], Tuple[List[Any], Optional[Dict]]], + initial_variables: Optional[Dict] = None +) -> List[Any]: + """Generic pagination function for GitHub GraphQL API""" + if initial_variables is None: + initial_variables = {} + + all_data = [] + has_next_page = True + cursor = None + variables = initial_variables.copy() + + while has_next_page: + if cursor: + variables['cursor'] = cursor + else: + # Remove cursor from variables if it's None + variables.pop('cursor', None) + + result = run_query(query, variables) + + if not result: + break + + data_batch, page_info = extract_function(result) + all_data.extend(data_batch) + + if page_info and page_info.get('hasNextPage'): + cursor = page_info.get('endCursor') + else: + has_next_page = False + + return all_data + def write_root_md_files(md_files: List[str]) -> None: """Write the .md files in the root folder as JSONL""" with open('root_md_files.jsonl', 'w') as f: @@ -221,153 +257,132 @@ def check_license(owner: str, repo_name: str) -> str: def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all releases with timestamps from the past year""" - releases = [] - has_next_page = True - cursor = None - - while has_next_page: - query = RELEASES_QUERY % (owner, repo_name) - variables = {} - if cursor: - 
variables['cursor'] = cursor - - result = run_query(query, variables) + def extract_releases(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + releases = [] + page_info = None - if result and 'data' in result and result['data']['repository']['releases']: - release_edges = result['data']['repository']['releases']['edges'] - - # Filter releases from the past year - for edge in release_edges: - published_at = edge['node']['publishedAt'] - if published_at and published_at >= one_year_ago: - releases.append({ - 'name': edge['node']['name'] or 'Unnamed release', - 'publishedAt': published_at - }) - elif published_at and published_at < one_year_ago: - # Stop if we've gone past the one-year boundary - has_next_page = False - break - - # Check pagination info - page_info = result['data']['repository']['releases']['pageInfo'] - has_next_page = page_info['hasNextPage'] and has_next_page - cursor = page_info['endCursor'] - else: - has_next_page = False + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('releases'): + release_edges = repo_data['releases']['edges'] + page_info = repo_data['releases']['pageInfo'] + + # Filter releases from the past year + for edge in release_edges: + published_at = edge['node']['publishedAt'] + if published_at and published_at >= one_year_ago: + releases.append({ + 'name': edge['node']['name'] or 'Unnamed release', + 'publishedAt': published_at + }) + elif published_at and published_at < one_year_ago: + # Stop pagination if we've gone past the one-year boundary + return releases, None + + return releases, page_info + query = RELEASES_QUERY % (owner, repo_name) + releases = paginate_github_query(query, extract_releases) write_releases(releases) return releases def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: """Check and return all contributors with their most recent contribution date from the past year""" - contributors: Dict[str, str] = {} - 
has_next_page = True - cursor = None - - while has_next_page: - query = CONTRIBUTORS_QUERY % (owner, repo_name) - variables = {'since': one_year_ago} - if cursor: - variables['cursor'] = cursor - - result = run_query(query, variables) + def extract_contributors(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + contributors: Dict[str, str] = {} + page_info = None - if result and 'data' in result and result['data']['repository']['defaultBranchRef']: - history = result['data']['repository']['defaultBranchRef']['target']['history'] - commits = history['nodes'] - - for commit in commits: - if commit['author'] and commit['author']['user']: - login = commit['author']['user']['login'] - date = commit['committedDate'] - if login not in contributors or date > contributors[login]: - contributors[login] = date - - # Check pagination info - page_info = history['pageInfo'] - has_next_page = page_info['hasNextPage'] - cursor = page_info['endCursor'] - else: - has_next_page = False + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'): + target = repo_data['defaultBranchRef']['target'] + if target.get('history'): + history = target['history'] + commit_nodes = history['nodes'] + page_info = history['pageInfo'] + + for commit in commit_nodes: + if commit.get('author') and commit['author'].get('user'): + login = commit['author']['user']['login'] + date = commit['committedDate'] + if login and date: + if login not in contributors or date > contributors[login]: + contributors[login] = date + + return [contributors], page_info - write_contributors(contributors) - return contributors + query = CONTRIBUTORS_QUERY % (owner, repo_name) + contributor_list = paginate_github_query(query, extract_contributors, {'since': one_year_ago}) + + # Merge all contributor dictionaries + final_contributors: Dict[str, str] = {} + for contributors in 
contributor_list: + for login, date in contributors.items(): + if login not in final_contributors or date > final_contributors[login]: + final_contributors[login] = date + + write_contributors(final_contributors) + return final_contributors def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all commits from the past year""" - commits = [] - has_next_page = True - cursor = None - - while has_next_page: - query = COMMITS_QUERY % (owner, repo_name) - variables = {'since': one_year_ago} - if cursor: - variables['cursor'] = cursor - - result = run_query(query, variables) + def extract_commits(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + commits = [] + page_info = None - if result and 'data' in result and result['data']['repository']['defaultBranchRef']: - history = result['data']['repository']['defaultBranchRef']['target']['history'] - commit_nodes = history['nodes'] - - for commit in commit_nodes: - commits.append({ - 'message': commit['messageHeadline'], - 'date': commit['committedDate'], - 'author': commit['author']['name'] if commit['author'] else 'Unknown' - }) - - # Check pagination info - page_info = history['pageInfo'] - has_next_page = page_info['hasNextPage'] - cursor = page_info['endCursor'] - else: - has_next_page = False + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'): + target = repo_data['defaultBranchRef']['target'] + if target.get('history'): + history = target['history'] + commit_nodes = history['nodes'] + page_info = history['pageInfo'] + + for commit in commit_nodes: + commits.append({ + 'message': commit.get('messageHeadline', ''), + 'date': commit.get('committedDate', ''), + 'author': commit.get('author', {}).get('name', 'Unknown') if commit.get('author') else 'Unknown' + }) + + return commits, page_info + query = COMMITS_QUERY % (owner, repo_name) + 
commits = paginate_github_query(query, extract_commits, {'since': one_year_ago}) write_commits(commits) return commits def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all issues with creator and status from the past year""" - issues = [] - has_next_page = True - cursor = None - - while has_next_page: - query = ISSUES_QUERY % (owner, repo_name) - variables = {} - if cursor: - variables['cursor'] = cursor - - result = run_query(query, variables) + def extract_issues(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + issues = [] + page_info = None - if result and 'data' in result and result['data']['repository']['issues']: - issue_nodes = result['data']['repository']['issues']['nodes'] - - # Filter issues from the past year - for issue in issue_nodes: - created_at = issue['createdAt'] - if created_at and created_at >= one_year_ago: - issues.append({ - 'title': issue['title'], - 'state': issue['state'], - 'author': issue['author']['login'] if issue['author'] else 'Unknown', - 'createdAt': created_at - }) - elif created_at and created_at < one_year_ago: - # Stop if we've gone past the one-year boundary - has_next_page = False - break - - # Check pagination info - page_info = result['data']['repository']['issues']['pageInfo'] - has_next_page = page_info['hasNextPage'] and has_next_page - cursor = page_info['endCursor'] - else: - has_next_page = False + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('issues'): + issue_nodes = repo_data['issues']['nodes'] + page_info = repo_data['issues']['pageInfo'] + + # Filter issues from the past year + for issue in issue_nodes: + created_at = issue.get('createdAt') + if created_at and created_at >= one_year_ago: + issues.append({ + 'title': issue.get('title', ''), + 'state': issue.get('state', ''), + 'author': issue.get('author', {}).get('login', 'Unknown') if issue.get('author') else 'Unknown', + 'createdAt': 
created_at + }) + elif created_at and created_at < one_year_ago: + # Stop pagination if we've gone past the one-year boundary + return issues, None + + return issues, page_info + query = ISSUES_QUERY % (owner, repo_name) + issues = paginate_github_query(query, extract_issues) write_issues(issues) return issues From 7768cbdbe40bf111d55a6b3d4aad853ff3b6f002 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 19:57:49 +0200 Subject: [PATCH 13/36] feat: add comprehensive logging to monitor script execution and performance --- metrics_check.py | 59 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index 6f8263a..ed397bd 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -4,16 +4,27 @@ import sys import json import requests +import logging from datetime import datetime, timedelta from typing import Dict, List, Optional, Any, Callable, Tuple +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) +logger = logging.getLogger(__name__) + # Check if required environment variables are set GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') OWNER = os.environ.get('OWNER') REPO_NAME = os.environ.get('REPO_NAME') if not all([GITHUB_TOKEN, OWNER, REPO_NAME]): - print("Error: Please set GITHUB_TOKEN, OWNER, and REPO_NAME environment variables.") + logger.error("Please set GITHUB_TOKEN, OWNER, and REPO_NAME environment variables.") sys.exit(1) # Headers for GraphQL API @@ -150,14 +161,16 @@ def run_query(query: str, variables: Dict = {}) -> Optional[Dict[Any, Any]]: """Run a GraphQL query with variables and return the response""" try: + logger.debug(f"Running query with variables: {variables}") response = requests.post(url, headers=headers, json={'query': query, 'variables': variables}) if response.status_code == 200: + logger.debug("Query successful") 
return response.json() else: - print(f"Query failed with status code {response.status_code}") + logger.error(f"Query failed with status code {response.status_code}") return None except Exception as e: - print(f"Query failed with exception: {e}") + logger.error(f"Query failed with exception: {e}") return None def paginate_github_query( @@ -174,7 +187,11 @@ def paginate_github_query( cursor = None variables = initial_variables.copy() + page_count = 0 while has_next_page: + page_count += 1 + logger.info(f"Fetching page {page_count}...") + if cursor: variables['cursor'] = cursor else: @@ -184,55 +201,67 @@ def paginate_github_query( result = run_query(query, variables) if not result: + logger.warning("Query returned no result, stopping pagination") break data_batch, page_info = extract_function(result) all_data.extend(data_batch) + logger.debug(f"Retrieved {len(data_batch)} items in this batch") + if page_info and page_info.get('hasNextPage'): cursor = page_info.get('endCursor') + logger.debug(f"Next cursor: {cursor}") else: has_next_page = False + logger.info(f"Pagination complete. 
Total pages: {page_count}, Total items: {len(all_data)}") return all_data def write_root_md_files(md_files: List[str]) -> None: """Write the .md files in the root folder as JSONL""" + logger.info(f"Writing {len(md_files)} .md files to root_md_files.jsonl") with open('root_md_files.jsonl', 'w') as f: for file in md_files: f.write(json.dumps({"file": file}) + '\n') def write_license(license_name: str) -> None: """Write the repository license name as JSONL""" + logger.info(f"Writing license to license.jsonl: {license_name}") with open('license.jsonl', 'w') as f: f.write(json.dumps({"license": license_name}) + '\n') def write_releases(releases: List[Dict[str, str]]) -> None: """Write all releases with timestamps as JSONL""" + logger.info(f"Writing {len(releases)} releases to releases.jsonl") with open('releases.jsonl', 'w') as f: for release in releases: f.write(json.dumps(release) + '\n') def write_contributors(contributors: Dict[str, str]) -> None: """Write all contributors with their most recent contribution date as JSONL""" + logger.info(f"Writing {len(contributors)} contributors to contributors.jsonl") with open('contributors.jsonl', 'w') as f: for login, date in contributors.items(): f.write(json.dumps({"login": login, "last_contribution": date}) + '\n') def write_commits(commits: List[Dict[str, str]]) -> None: """Write all commits as JSONL""" + logger.info(f"Writing {len(commits)} commits to commits.jsonl") with open('commits.jsonl', 'w') as f: for commit in commits: f.write(json.dumps(commit) + '\n') def write_issues(issues: List[Dict[str, str]]) -> None: """Write all issues with creator and status as JSONL""" + logger.info(f"Writing {len(issues)} issues to issues.jsonl") with open('issues.jsonl', 'w') as f: for issue in issues: f.write(json.dumps(issue) + '\n') def check_root_md_files(owner: str, repo_name: str) -> List[str]: """Check and return all .md files in the root folder""" + logger.info("Checking root .md files...") query = ROOT_FILES_QUERY % (owner, 
repo_name) result = run_query(query) @@ -240,11 +269,15 @@ def check_root_md_files(owner: str, repo_name: str) -> List[str]: if result and 'data' in result and result['data']['repository']['object']: entries = result['data']['repository']['object']['entries'] md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] + logger.info(f"Found {len(md_files)} .md files in root") + else: + logger.warning("No .md files found or error occurred") write_root_md_files(md_files) return md_files def check_license(owner: str, repo_name: str) -> str: """Check and return the repository license name""" + logger.info("Checking license...") query = LICENSE_QUERY % (owner, repo_name) result = run_query(query) @@ -252,11 +285,15 @@ def check_license(owner: str, repo_name: str) -> str: if result and 'data' in result: license_info = result['data']['repository']['licenseInfo'] license_name = license_info['name'] if license_info else 'None' + logger.info(f"License found: {license_name}") + else: + logger.warning("Error retrieving license information") write_license(license_name) return license_name def check_releases(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all releases with timestamps from the past year""" + logger.info("Checking releases...") def extract_releases(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: releases = [] page_info = None @@ -277,17 +314,20 @@ def extract_releases(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict] }) elif published_at and published_at < one_year_ago: # Stop pagination if we've gone past the one-year boundary + logger.debug("Reached releases older than one year, stopping pagination") return releases, None return releases, page_info query = RELEASES_QUERY % (owner, repo_name) releases = paginate_github_query(query, extract_releases) + logger.info(f"Found {len(releases)} releases in the past year") write_releases(releases) return releases def check_contributors(owner: str, 
repo_name: str) -> Dict[str, str]: """Check and return all contributors with their most recent contribution date from the past year""" + logger.info("Checking contributors...") def extract_contributors(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: contributors: Dict[str, str] = {} page_info = None @@ -321,11 +361,13 @@ def extract_contributors(result: Dict) -> Tuple[List[Dict[str, str]], Optional[D if login not in final_contributors or date > final_contributors[login]: final_contributors[login] = date + logger.info(f"Found {len(final_contributors)} contributors in the past year") write_contributors(final_contributors) return final_contributors def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all commits from the past year""" + logger.info("Checking commits...") def extract_commits(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: commits = [] page_info = None @@ -350,11 +392,13 @@ def extract_commits(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]] query = COMMITS_QUERY % (owner, repo_name) commits = paginate_github_query(query, extract_commits, {'since': one_year_ago}) + logger.info(f"Found {len(commits)} commits in the past year") write_commits(commits) return commits def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: """Check and return all issues with creator and status from the past year""" + logger.info("Checking issues...") def extract_issues(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: issues = [] page_info = None @@ -377,22 +421,31 @@ def extract_issues(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: }) elif created_at and created_at < one_year_ago: # Stop pagination if we've gone past the one-year boundary + logger.debug("Reached issues older than one year, stopping pagination") return issues, None return issues, page_info query = ISSUES_QUERY % (owner, repo_name) issues = paginate_github_query(query, extract_issues) + 
logger.info(f"Found {len(issues)} issues in the past year") write_issues(issues) return issues def main() -> None: + logger.info(f"Starting metrics check for repository: {OWNER}/{REPO_NAME}") + start_time = datetime.now() + check_root_md_files(OWNER, REPO_NAME) check_license(OWNER, REPO_NAME) check_releases(OWNER, REPO_NAME) check_contributors(OWNER, REPO_NAME) check_commits(OWNER, REPO_NAME) check_issues(OWNER, REPO_NAME) + + end_time = datetime.now() + duration = end_time - start_time + logger.info(f"Metrics check completed in {duration.total_seconds():.2f} seconds") if __name__ == '__main__': main() From 8e99e2bfa9409a98cddac25f70fde4239bddccc0 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 20:05:30 +0200 Subject: [PATCH 14/36] refactor: Separate concerns into distinct modules for GitHub client, metrics processing, and file writing --- file_writer.py | 49 +++++ github_client.py | 167 ++++++++++++++++ metrics_check.py | 458 ++++--------------------------------------- metrics_processor.py | 228 +++++++++++++++++++++ 4 files changed, 479 insertions(+), 423 deletions(-) create mode 100644 file_writer.py create mode 100644 github_client.py create mode 100644 metrics_processor.py diff --git a/file_writer.py b/file_writer.py new file mode 100644 index 0000000..c805f1f --- /dev/null +++ b/file_writer.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 + +import json +import logging +from typing import Dict, List + +logger = logging.getLogger(__name__) + +class FileWriter: + def write_root_md_files(self, md_files: List[str]) -> None: + """Write the .md files in the root folder as JSONL""" + logger.info(f"Writing {len(md_files)} .md files to root_md_files.jsonl") + with open('root_md_files.jsonl', 'w') as f: + for file in md_files: + f.write(json.dumps({"file": file}) + '\n') + + def write_license(self, license_name: str) -> None: + """Write the repository license name as JSONL""" + logger.info(f"Writing license to license.jsonl: {license_name}") + with 
open('license.jsonl', 'w') as f: + f.write(json.dumps({"license": license_name}) + '\n') + + def write_releases(self, releases: List[Dict[str, str]]) -> None: + """Write all releases with timestamps as JSONL""" + logger.info(f"Writing {len(releases)} releases to releases.jsonl") + with open('releases.jsonl', 'w') as f: + for release in releases: + f.write(json.dumps(release) + '\n') + + def write_contributors(self, contributors: Dict[str, str]) -> None: + """Write all contributors with their most recent contribution date as JSONL""" + logger.info(f"Writing {len(contributors)} contributors to contributors.jsonl") + with open('contributors.jsonl', 'w') as f: + for login, date in contributors.items(): + f.write(json.dumps({"login": login, "last_contribution": date}) + '\n') + + def write_commits(self, commits: List[Dict[str, str]]) -> None: + """Write all commits as JSONL""" + logger.info(f"Writing {len(commits)} commits to commits.jsonl") + with open('commits.jsonl', 'w') as f: + for commit in commits: + f.write(json.dumps(commit) + '\n') + + def write_issues(self, issues: List[Dict[str, str]]) -> None: + """Write all issues with creator and status as JSONL""" + logger.info(f"Writing {len(issues)} issues to issues.jsonl") + with open('issues.jsonl', 'w') as f: + for issue in issues: + f.write(json.dumps(issue) + '\n') diff --git a/github_client.py b/github_client.py new file mode 100644 index 0000000..6655eca --- /dev/null +++ b/github_client.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +import os +import sys +import requests +import logging +from datetime import datetime, timedelta +from typing import Dict, Optional, Any + +logger = logging.getLogger(__name__) + +class GitHubClient: + def __init__(self): + # Check if required environment variables are set + self.github_token = os.environ.get('GITHUB_TOKEN') + self.owner = os.environ.get('OWNER') + self.repo_name = os.environ.get('REPO_NAME') + + if not all([self.github_token, self.owner, self.repo_name]): + 
logger.error("Please set GITHUB_TOKEN, OWNER, and REPO_NAME environment variables.") + sys.exit(1) + + # Headers for GraphQL API + self.headers = { + 'Authorization': f'bearer {self.github_token}', + 'Content-Type': 'application/json' + } + + # GraphQL endpoint + self.url = 'https://api.github.com/graphql' + + # Calculate the date for one year ago + self.one_year_ago = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%dT%H:%M:%SZ') + + # GraphQL Queries + self.ROOT_FILES_QUERY = ''' + { + repository(owner: "%s", name: "%s") { + object(expression: "HEAD:") { + ... on Tree { + entries { + name + } + } + } + } + } + ''' + + self.LICENSE_QUERY = ''' + { + repository(owner: "%s", name: "%s") { + licenseInfo { + name + } + } + } + ''' + + self.RELEASES_QUERY = ''' + query($cursor: String) { + repository(owner: "%s", name: "%s") { + releases(first: 100, orderBy: {field: CREATED_AT, direction: DESC}, after: $cursor) { + edges { + node { + name + publishedAt + } + cursor + } + pageInfo { + hasNextPage + endCursor + } + } + } + } + ''' + + self.CONTRIBUTORS_QUERY = ''' + query($cursor: String, $since: GitTimestamp!) { + repository(owner: "%s", name: "%s") { + defaultBranchRef { + target { + ... on Commit { + history(first: 100, since: $since, after: $cursor) { + nodes { + author { + user { + login + } + } + committedDate + } + pageInfo { + hasNextPage + endCursor + } + } + } + } + } + } + } + ''' + + self.COMMITS_QUERY = ''' + query($cursor: String, $since: GitTimestamp!) { + repository(owner: "%s", name: "%s") { + defaultBranchRef { + target { + ... 
on Commit { + history(first: 100, since: $since, after: $cursor) { + nodes { + messageHeadline + committedDate + author { + name + } + } + pageInfo { + hasNextPage + endCursor + } + } + } + } + } + } + } + ''' + + self.ISSUES_QUERY = ''' + query($cursor: String) { + repository(owner: "%s", name: "%s") { + issues(first: 100, states: [OPEN, CLOSED], after: $cursor, orderBy: {field: CREATED_AT, direction: DESC}) { + nodes { + title + state + author { + login + } + createdAt + } + pageInfo { + hasNextPage + endCursor + } + } + } + } + ''' + + def run_query(self, query: str, variables: Dict = {}) -> Optional[Dict[Any, Any]]: + """Run a GraphQL query with variables and return the response""" + try: + logger.debug(f"Running query with variables: {variables}") + response = requests.post(self.url, headers=self.headers, json={'query': query, 'variables': variables}) + if response.status_code == 200: + logger.debug("Query successful") + return response.json() + else: + logger.error(f"Query failed with status code {response.status_code}") + return None + except Exception as e: + logger.error(f"Query failed with exception: {e}") + return None diff --git a/metrics_check.py b/metrics_check.py index ed397bd..d94abb2 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -1,12 +1,11 @@ #!/usr/bin/env python3 -import os import sys -import json -import requests import logging -from datetime import datetime, timedelta -from typing import Dict, List, Optional, Any, Callable, Tuple +from datetime import datetime +from github_client import GitHubClient +from metrics_processor import MetricsProcessor +from file_writer import FileWriter # Set up logging logging.basicConfig( @@ -18,434 +17,47 @@ ) logger = logging.getLogger(__name__) -# Check if required environment variables are set -GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN') -OWNER = os.environ.get('OWNER') -REPO_NAME = os.environ.get('REPO_NAME') - -if not all([GITHUB_TOKEN, OWNER, REPO_NAME]): - logger.error("Please set 
GITHUB_TOKEN, OWNER, and REPO_NAME environment variables.") - sys.exit(1) - -# Headers for GraphQL API -headers = { - 'Authorization': f'bearer {GITHUB_TOKEN}', - 'Content-Type': 'application/json' -} - -# GraphQL endpoint -url = 'https://api.github.com/graphql' - -# Calculate the date for one year ago -one_year_ago = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%dT%H:%M:%SZ') - -# GraphQL Queries with pagination and date filtering -ROOT_FILES_QUERY = ''' -{ - repository(owner: "%s", name: "%s") { - object(expression: "HEAD:") { - ... on Tree { - entries { - name - } - } - } - } -} -''' - -LICENSE_QUERY = ''' -{ - repository(owner: "%s", name: "%s") { - licenseInfo { - name - } - } -} -''' - -RELEASES_QUERY = ''' -query($cursor: String) { - repository(owner: "%s", name: "%s") { - releases(first: 100, orderBy: {field: CREATED_AT, direction: DESC}, after: $cursor) { - edges { - node { - name - publishedAt - } - cursor - } - pageInfo { - hasNextPage - endCursor - } - } - } -} -''' - -CONTRIBUTORS_QUERY = ''' -query($cursor: String, $since: GitTimestamp!) { - repository(owner: "%s", name: "%s") { - defaultBranchRef { - target { - ... on Commit { - history(first: 100, since: $since, after: $cursor) { - nodes { - author { - user { - login - } - } - committedDate - } - pageInfo { - hasNextPage - endCursor - } - } - } - } - } - } -} -''' - -COMMITS_QUERY = ''' -query($cursor: String, $since: GitTimestamp!) { - repository(owner: "%s", name: "%s") { - defaultBranchRef { - target { - ... 
on Commit { - history(first: 100, since: $since, after: $cursor) { - nodes { - messageHeadline - committedDate - author { - name - } - } - pageInfo { - hasNextPage - endCursor - } - } - } - } - } - } -} -''' - -ISSUES_QUERY = ''' -query($cursor: String) { - repository(owner: "%s", name: "%s") { - issues(first: 100, states: [OPEN, CLOSED], after: $cursor, orderBy: {field: CREATED_AT, direction: DESC}) { - nodes { - title - state - author { - login - } - createdAt - } - pageInfo { - hasNextPage - endCursor - } - } - } -} -''' - -def run_query(query: str, variables: Dict = {}) -> Optional[Dict[Any, Any]]: - """Run a GraphQL query with variables and return the response""" +def main() -> None: + logger.info("Starting metrics check") + start_time = datetime.now() + try: - logger.debug(f"Running query with variables: {variables}") - response = requests.post(url, headers=headers, json={'query': query, 'variables': variables}) - if response.status_code == 200: - logger.debug("Query successful") - return response.json() - else: - logger.error(f"Query failed with status code {response.status_code}") - return None - except Exception as e: - logger.error(f"Query failed with exception: {e}") - return None - -def paginate_github_query( - query: str, - extract_function: Callable[[Dict], Tuple[List[Any], Optional[Dict]]], - initial_variables: Optional[Dict] = None -) -> List[Any]: - """Generic pagination function for GitHub GraphQL API""" - if initial_variables is None: - initial_variables = {} + # Initialize components + github_client = GitHubClient() + metrics_processor = MetricsProcessor(github_client) + file_writer = FileWriter() - all_data = [] - has_next_page = True - cursor = None - variables = initial_variables.copy() - - page_count = 0 - while has_next_page: - page_count += 1 - logger.info(f"Fetching page {page_count}...") + owner = github_client.owner + repo_name = github_client.repo_name - if cursor: - variables['cursor'] = cursor - else: - # Remove cursor from variables 
if it's None - variables.pop('cursor', None) - - result = run_query(query, variables) + logger.info(f"Processing repository: {owner}/{repo_name}") - if not result: - logger.warning("Query returned no result, stopping pagination") - break - - data_batch, page_info = extract_function(result) - all_data.extend(data_batch) + # Process each metric + md_files = metrics_processor.get_root_md_files(owner, repo_name) + file_writer.write_root_md_files(md_files) - logger.debug(f"Retrieved {len(data_batch)} items in this batch") + license_name = metrics_processor.get_license(owner, repo_name) + file_writer.write_license(license_name) - if page_info and page_info.get('hasNextPage'): - cursor = page_info.get('endCursor') - logger.debug(f"Next cursor: {cursor}") - else: - has_next_page = False - - logger.info(f"Pagination complete. Total pages: {page_count}, Total items: {len(all_data)}") - return all_data - -def write_root_md_files(md_files: List[str]) -> None: - """Write the .md files in the root folder as JSONL""" - logger.info(f"Writing {len(md_files)} .md files to root_md_files.jsonl") - with open('root_md_files.jsonl', 'w') as f: - for file in md_files: - f.write(json.dumps({"file": file}) + '\n') - -def write_license(license_name: str) -> None: - """Write the repository license name as JSONL""" - logger.info(f"Writing license to license.jsonl: {license_name}") - with open('license.jsonl', 'w') as f: - f.write(json.dumps({"license": license_name}) + '\n') - -def write_releases(releases: List[Dict[str, str]]) -> None: - """Write all releases with timestamps as JSONL""" - logger.info(f"Writing {len(releases)} releases to releases.jsonl") - with open('releases.jsonl', 'w') as f: - for release in releases: - f.write(json.dumps(release) + '\n') - -def write_contributors(contributors: Dict[str, str]) -> None: - """Write all contributors with their most recent contribution date as JSONL""" - logger.info(f"Writing {len(contributors)} contributors to contributors.jsonl") - with 
open('contributors.jsonl', 'w') as f: - for login, date in contributors.items(): - f.write(json.dumps({"login": login, "last_contribution": date}) + '\n') - -def write_commits(commits: List[Dict[str, str]]) -> None: - """Write all commits as JSONL""" - logger.info(f"Writing {len(commits)} commits to commits.jsonl") - with open('commits.jsonl', 'w') as f: - for commit in commits: - f.write(json.dumps(commit) + '\n') - -def write_issues(issues: List[Dict[str, str]]) -> None: - """Write all issues with creator and status as JSONL""" - logger.info(f"Writing {len(issues)} issues to issues.jsonl") - with open('issues.jsonl', 'w') as f: - for issue in issues: - f.write(json.dumps(issue) + '\n') - -def check_root_md_files(owner: str, repo_name: str) -> List[str]: - """Check and return all .md files in the root folder""" - logger.info("Checking root .md files...") - query = ROOT_FILES_QUERY % (owner, repo_name) - result = run_query(query) - - md_files = [] - if result and 'data' in result and result['data']['repository']['object']: - entries = result['data']['repository']['object']['entries'] - md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] - logger.info(f"Found {len(md_files)} .md files in root") - else: - logger.warning("No .md files found or error occurred") - write_root_md_files(md_files) - return md_files - -def check_license(owner: str, repo_name: str) -> str: - """Check and return the repository license name""" - logger.info("Checking license...") - query = LICENSE_QUERY % (owner, repo_name) - result = run_query(query) - - license_name = 'None' - if result and 'data' in result: - license_info = result['data']['repository']['licenseInfo'] - license_name = license_info['name'] if license_info else 'None' - logger.info(f"License found: {license_name}") - else: - logger.warning("Error retrieving license information") - write_license(license_name) - return license_name - -def check_releases(owner: str, repo_name: str) -> List[Dict[str, 
str]]: - """Check and return all releases with timestamps from the past year""" - logger.info("Checking releases...") - def extract_releases(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: - releases = [] - page_info = None + releases = metrics_processor.get_releases(owner, repo_name) + file_writer.write_releases(releases) - if 'data' in result and result['data']['repository']: - repo_data = result['data']['repository'] - if repo_data.get('releases'): - release_edges = repo_data['releases']['edges'] - page_info = repo_data['releases']['pageInfo'] - - # Filter releases from the past year - for edge in release_edges: - published_at = edge['node']['publishedAt'] - if published_at and published_at >= one_year_ago: - releases.append({ - 'name': edge['node']['name'] or 'Unnamed release', - 'publishedAt': published_at - }) - elif published_at and published_at < one_year_ago: - # Stop pagination if we've gone past the one-year boundary - logger.debug("Reached releases older than one year, stopping pagination") - return releases, None - - return releases, page_info - - query = RELEASES_QUERY % (owner, repo_name) - releases = paginate_github_query(query, extract_releases) - logger.info(f"Found {len(releases)} releases in the past year") - write_releases(releases) - return releases - -def check_contributors(owner: str, repo_name: str) -> Dict[str, str]: - """Check and return all contributors with their most recent contribution date from the past year""" - logger.info("Checking contributors...") - def extract_contributors(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: - contributors: Dict[str, str] = {} - page_info = None + contributors = metrics_processor.get_contributors(owner, repo_name) + file_writer.write_contributors(contributors) - if 'data' in result and result['data']['repository']: - repo_data = result['data']['repository'] - if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'): - target = 
repo_data['defaultBranchRef']['target'] - if target.get('history'): - history = target['history'] - commit_nodes = history['nodes'] - page_info = history['pageInfo'] - - for commit in commit_nodes: - if commit.get('author') and commit['author'].get('user'): - login = commit['author']['user']['login'] - date = commit['committedDate'] - if login and date: - if login not in contributors or date > contributors[login]: - contributors[login] = date - - return [contributors], page_info - - query = CONTRIBUTORS_QUERY % (owner, repo_name) - contributor_list = paginate_github_query(query, extract_contributors, {'since': one_year_ago}) - - # Merge all contributor dictionaries - final_contributors: Dict[str, str] = {} - for contributors in contributor_list: - for login, date in contributors.items(): - if login not in final_contributors or date > final_contributors[login]: - final_contributors[login] = date - - logger.info(f"Found {len(final_contributors)} contributors in the past year") - write_contributors(final_contributors) - return final_contributors - -def check_commits(owner: str, repo_name: str) -> List[Dict[str, str]]: - """Check and return all commits from the past year""" - logger.info("Checking commits...") - def extract_commits(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: - commits = [] - page_info = None + commits = metrics_processor.get_commits(owner, repo_name) + file_writer.write_commits(commits) - if 'data' in result and result['data']['repository']: - repo_data = result['data']['repository'] - if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'): - target = repo_data['defaultBranchRef']['target'] - if target.get('history'): - history = target['history'] - commit_nodes = history['nodes'] - page_info = history['pageInfo'] - - for commit in commit_nodes: - commits.append({ - 'message': commit.get('messageHeadline', ''), - 'date': commit.get('committedDate', ''), - 'author': commit.get('author', {}).get('name', 
'Unknown') if commit.get('author') else 'Unknown' - }) - - return commits, page_info - - query = COMMITS_QUERY % (owner, repo_name) - commits = paginate_github_query(query, extract_commits, {'since': one_year_ago}) - logger.info(f"Found {len(commits)} commits in the past year") - write_commits(commits) - return commits - -def check_issues(owner: str, repo_name: str) -> List[Dict[str, str]]: - """Check and return all issues with creator and status from the past year""" - logger.info("Checking issues...") - def extract_issues(result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: - issues = [] - page_info = None + issues = metrics_processor.get_issues(owner, repo_name) + file_writer.write_issues(issues) - if 'data' in result and result['data']['repository']: - repo_data = result['data']['repository'] - if repo_data.get('issues'): - issue_nodes = repo_data['issues']['nodes'] - page_info = repo_data['issues']['pageInfo'] - - # Filter issues from the past year - for issue in issue_nodes: - created_at = issue.get('createdAt') - if created_at and created_at >= one_year_ago: - issues.append({ - 'title': issue.get('title', ''), - 'state': issue.get('state', ''), - 'author': issue.get('author', {}).get('login', 'Unknown') if issue.get('author') else 'Unknown', - 'createdAt': created_at - }) - elif created_at and created_at < one_year_ago: - # Stop pagination if we've gone past the one-year boundary - logger.debug("Reached issues older than one year, stopping pagination") - return issues, None - - return issues, page_info - - query = ISSUES_QUERY % (owner, repo_name) - issues = paginate_github_query(query, extract_issues) - logger.info(f"Found {len(issues)} issues in the past year") - write_issues(issues) - return issues - -def main() -> None: - logger.info(f"Starting metrics check for repository: {OWNER}/{REPO_NAME}") - start_time = datetime.now() - - check_root_md_files(OWNER, REPO_NAME) - check_license(OWNER, REPO_NAME) - check_releases(OWNER, REPO_NAME) - 
check_contributors(OWNER, REPO_NAME) - check_commits(OWNER, REPO_NAME) - check_issues(OWNER, REPO_NAME) - - end_time = datetime.now() - duration = end_time - start_time - logger.info(f"Metrics check completed in {duration.total_seconds():.2f} seconds") + end_time = datetime.now() + duration = end_time - start_time + logger.info(f"Metrics check completed in {duration.total_seconds():.2f} seconds") + + except Exception as e: + logger.error(f"Application failed: {e}") + sys.exit(1) if __name__ == '__main__': main() diff --git a/metrics_processor.py b/metrics_processor.py new file mode 100644 index 0000000..474ccd3 --- /dev/null +++ b/metrics_processor.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 + +import logging +from typing import Dict, List, Tuple, Optional, Any +from github_client import GitHubClient + +logger = logging.getLogger(__name__) + +class MetricsProcessor: + def __init__(self, github_client: GitHubClient): + self.github_client = github_client + + def get_root_md_files(self, owner: str, repo_name: str) -> List[str]: + """Get all .md files in the root folder""" + logger.info("Checking root .md files...") + query = self.github_client.ROOT_FILES_QUERY % (owner, repo_name) + result = self.github_client.run_query(query) + + md_files = [] + if result and 'data' in result and result['data']['repository']['object']: + entries = result['data']['repository']['object']['entries'] + md_files = [entry['name'] for entry in entries if entry['name'].endswith('.md')] + logger.info(f"Found {len(md_files)} .md files in root") + else: + logger.warning("No .md files found or error occurred") + return md_files + + def get_license(self, owner: str, repo_name: str) -> str: + """Get the repository license name""" + logger.info("Checking license...") + query = self.github_client.LICENSE_QUERY % (owner, repo_name) + result = self.github_client.run_query(query) + + license_name = 'None' + if result and 'data' in result: + license_info = result['data']['repository']['licenseInfo'] + 
license_name = license_info['name'] if license_info else 'None' + logger.info(f"License found: {license_name}") + else: + logger.warning("Error retrieving license information") + return license_name + + def _extract_releases(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + """Extract releases from GraphQL response""" + releases = [] + page_info = None + + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('releases'): + release_edges = repo_data['releases']['edges'] + page_info = repo_data['releases']['pageInfo'] + + # Filter releases from the past year + for edge in release_edges: + published_at = edge['node']['publishedAt'] + if published_at and published_at >= self.github_client.one_year_ago: + releases.append({ + 'name': edge['node']['name'] or 'Unnamed release', + 'publishedAt': published_at + }) + elif published_at and published_at < self.github_client.one_year_ago: + # Stop pagination if we've gone past the one-year boundary + logger.debug("Reached releases older than one year, stopping pagination") + return releases, None + + return releases, page_info + + def get_releases(self, owner: str, repo_name: str) -> List[Dict[str, str]]: + """Get all releases with timestamps from the past year""" + logger.info("Checking releases...") + query = self.github_client.RELEASES_QUERY % (owner, repo_name) + releases = self._paginate_github_query(query, self._extract_releases) + logger.info(f"Found {len(releases)} releases in the past year") + return releases + + def _extract_contributors(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + """Extract contributors from GraphQL response""" + contributors: Dict[str, str] = {} + page_info = None + + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'): + target = 
repo_data['defaultBranchRef']['target'] + if target.get('history'): + history = target['history'] + commit_nodes = history['nodes'] + page_info = history['pageInfo'] + + for commit in commit_nodes: + if commit.get('author') and commit['author'].get('user'): + login = commit['author']['user']['login'] + date = commit['committedDate'] + if login and date: + if login not in contributors or date > contributors[login]: + contributors[login] = date + + return [contributors], page_info + + def get_contributors(self, owner: str, repo_name: str) -> Dict[str, str]: + """Get all contributors with their most recent contribution date from the past year""" + logger.info("Checking contributors...") + query = self.github_client.CONTRIBUTORS_QUERY % (owner, repo_name) + contributor_list = self._paginate_github_query(query, self._extract_contributors, {'since': self.github_client.one_year_ago}) + + # Merge all contributor dictionaries + final_contributors: Dict[str, str] = {} + for contributors in contributor_list: + for login, date in contributors.items(): + if login not in final_contributors or date > final_contributors[login]: + final_contributors[login] = date + + logger.info(f"Found {len(final_contributors)} contributors in the past year") + return final_contributors + + def _extract_commits(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + """Extract commits from GraphQL response""" + commits = [] + page_info = None + + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('defaultBranchRef') and repo_data['defaultBranchRef'].get('target'): + target = repo_data['defaultBranchRef']['target'] + if target.get('history'): + history = target['history'] + commit_nodes = history['nodes'] + page_info = history['pageInfo'] + + for commit in commit_nodes: + commits.append({ + 'message': commit.get('messageHeadline', ''), + 'date': commit.get('committedDate', ''), + 'author': commit.get('author', 
{}).get('name', 'Unknown') if commit.get('author') else 'Unknown' + }) + + return commits, page_info + + def get_commits(self, owner: str, repo_name: str) -> List[Dict[str, str]]: + """Get all commits from the past year""" + logger.info("Checking commits...") + query = self.github_client.COMMITS_QUERY % (owner, repo_name) + commits = self._paginate_github_query(query, self._extract_commits, {'since': self.github_client.one_year_ago}) + logger.info(f"Found {len(commits)} commits in the past year") + return commits + + def _extract_issues(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: + """Extract issues from GraphQL response""" + issues = [] + page_info = None + + if 'data' in result and result['data']['repository']: + repo_data = result['data']['repository'] + if repo_data.get('issues'): + issue_nodes = repo_data['issues']['nodes'] + page_info = repo_data['issues']['pageInfo'] + + # Filter issues from the past year + for issue in issue_nodes: + created_at = issue.get('createdAt') + if created_at and created_at >= self.github_client.one_year_ago: + issues.append({ + 'title': issue.get('title', ''), + 'state': issue.get('state', ''), + 'author': issue.get('author', {}).get('login', 'Unknown') if issue.get('author') else 'Unknown', + 'createdAt': created_at + }) + elif created_at and created_at < self.github_client.one_year_ago: + # Stop pagination if we've gone past the one-year boundary + logger.debug("Reached issues older than one year, stopping pagination") + return issues, None + + return issues, page_info + + def get_issues(self, owner: str, repo_name: str) -> List[Dict[str, str]]: + """Get all issues with creator and status from the past year""" + logger.info("Checking issues...") + query = self.github_client.ISSUES_QUERY % (owner, repo_name) + issues = self._paginate_github_query(query, self._extract_issues) + logger.info(f"Found {len(issues)} issues in the past year") + return issues + + def _paginate_github_query( + self, + query: str, + 
extract_function, + initial_variables: Optional[Dict] = None + ) -> List[Any]: + """Generic pagination function for GitHub GraphQL API""" + if initial_variables is None: + initial_variables = {} + + all_data = [] + has_next_page = True + cursor = None + variables = initial_variables.copy() + + page_count = 0 + while has_next_page: + page_count += 1 + logger.info(f"Fetching page {page_count}...") + + if cursor: + variables['cursor'] = cursor + else: + # Remove cursor from variables if it's None + variables.pop('cursor', None) + + result = self.github_client.run_query(query, variables) + + if not result: + logger.warning("Query returned no result, stopping pagination") + break + + data_batch, page_info = extract_function(result) + all_data.extend(data_batch) + + logger.debug(f"Retrieved {len(data_batch)} items in this batch") + + if page_info and page_info.get('hasNextPage'): + cursor = page_info.get('endCursor') + logger.debug(f"Next cursor: {cursor}") + else: + has_next_page = False + + logger.info(f"Pagination complete. 
Total pages: {page_count}, Total items: {len(all_data)}") + return all_data From a372d048fbe4ac1d23ba0f885f0d78dcdfbba879 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 20:15:03 +0200 Subject: [PATCH 15/36] feat: add YAML configuration support for GitHub client and update metrics check to use config file --- config.yaml | 2 ++ github_client.py | 24 ++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 config.yaml diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..fe9ab12 --- /dev/null +++ b/config.yaml @@ -0,0 +1,2 @@ +owner: "duckdb" +repo_name: "duckdb" diff --git a/github_client.py b/github_client.py index 6655eca..d917359 100644 --- a/github_client.py +++ b/github_client.py @@ -4,6 +4,7 @@ import sys import requests import logging +import yaml from datetime import datetime, timedelta from typing import Dict, Optional, Any @@ -13,11 +14,26 @@ class GitHubClient: def __init__(self): # Check if required environment variables are set self.github_token = os.environ.get('GITHUB_TOKEN') - self.owner = os.environ.get('OWNER') - self.repo_name = os.environ.get('REPO_NAME') - if not all([self.github_token, self.owner, self.repo_name]): - logger.error("Please set GITHUB_TOKEN, OWNER, and REPO_NAME environment variables.") + # Load owner and repo_name from config.yaml + try: + with open('config.yaml', 'r') as f: + config = yaml.safe_load(f) + self.owner = config.get('owner') + self.repo_name = config.get('repo_name') + except FileNotFoundError: + logger.error("config.yaml file not found.") + sys.exit(1) + except yaml.YAMLError as e: + logger.error(f"Error parsing config.yaml: {e}") + sys.exit(1) + + if not self.github_token: + logger.error("Please set GITHUB_TOKEN environment variable.") + sys.exit(1) + + if not self.owner or not self.repo_name: + logger.error("Please ensure config.yaml contains 'owner' and 'repo_name' keys.") sys.exit(1) # Headers for GraphQL API From 
ed7f754c9dc2e7a38d8f5d1bdfa9489e7ba34e36 Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 20:16:53 +0200 Subject: [PATCH 16/36] feat: add requirements.txt file --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9ce2269 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +requests +yaml From 13c85a03e4958953a4ccb066ba4b3ecdad50dee4 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 20:16:55 +0200 Subject: [PATCH 17/36] fix: add PyYAML to requirements and handle import error gracefully --- github_client.py | 7 +++++-- requirements.txt | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/github_client.py b/github_client.py index d917359..7d518c5 100644 --- a/github_client.py +++ b/github_client.py @@ -4,7 +4,6 @@ import sys import requests import logging -import yaml from datetime import datetime, timedelta from typing import Dict, Optional, Any @@ -17,6 +16,7 @@ def __init__(self): # Load owner and repo_name from config.yaml try: + import yaml with open('config.yaml', 'r') as f: config = yaml.safe_load(f) self.owner = config.get('owner') @@ -24,7 +24,10 @@ def __init__(self): except FileNotFoundError: logger.error("config.yaml file not found.") sys.exit(1) - except yaml.YAMLError as e: + except ImportError: + logger.error("PyYAML is not installed. 
Please install it with: pip install PyYAML") + sys.exit(1) + except Exception as e: logger.error(f"Error parsing config.yaml: {e}") sys.exit(1) diff --git a/requirements.txt b/requirements.txt index 9ce2269..33f05f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ requests -yaml +PyYAML From eabf74ed32661938b8b333eb16c59b825e227951 Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 21:41:51 +0200 Subject: [PATCH 18/36] fix: use smaller test repo --- .gitignore | 6 ++ config.yaml | 2 +- metrics_check.sh | 142 ----------------------------------------------- 3 files changed, 7 insertions(+), 143 deletions(-) delete mode 100755 metrics_check.sh diff --git a/.gitignore b/.gitignore index aa54bbb..1342bb7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,9 @@ .local .workspace .aider* +.env +.evidence +__pycache__ +*.jsonl +node_modules +venv diff --git a/config.yaml b/config.yaml index fe9ab12..6ededa6 100644 --- a/config.yaml +++ b/config.yaml @@ -1,2 +1,2 @@ owner: "duckdb" -repo_name: "duckdb" +repo_name: "duckdb-wasm" diff --git a/metrics_check.sh b/metrics_check.sh deleted file mode 100755 index 916beb9..0000000 --- a/metrics_check.sh +++ /dev/null @@ -1,142 +0,0 @@ -#!/bin/bash - -# Check if required environment variables are set -if [ -z "$GITHUB_TOKEN" ]; then - echo "Error: GITHUB_TOKEN environment variable is not set." - exit 1 -fi - -if [ -z "$OWNER" ]; then - echo "Error: OWNER environment variable is not set." - exit 1 -fi - -if [ -z "$REPO_NAME" ]; then - echo "Error: REPO_NAME environment variable is not set." - exit 1 -fi - -# Calculate the date one year ago in ISO 8601 format -SINCE_DATE=$(date -u -d "365 days ago" +"%Y-%m-%dT%H:%M:%SZ" 2>/dev/null || date -v-365d -u +"%Y-%m-%dT%H:%M:%SZ") -echo "Since date: $SINCE_DATE" - -echo "Checking metrics for repository: $OWNER/$REPO_NAME" -echo "==================================================" - -# 1. List all .md files in the root folder -echo "1. 
Listing all .md files in the root folder..." -QUERY1='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { object(expression: \"HEAD:\") { ... on Tree { entries { name } } } } }" -}' -echo " Query: $QUERY1" -ROOT_FILES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$QUERY1" https://api.github.com/graphql) -echo " Raw response: $ROOT_FILES_RESPONSE" - -# Extract .md files -echo " .md files in root:" -echo "$ROOT_FILES_RESPONSE" | grep -o '"name":"[^"]*"' | cut -d'"' -f4 | grep '\.md$' | while read -r file; do - echo " - $file" -done -echo - -# 2. Get license name -echo "2. Getting license name..." -QUERY2='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { licenseInfo { name } } }" -}' -echo " Query: $QUERY2" -LICENSE_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$QUERY2" https://api.github.com/graphql) -echo " Raw response: $LICENSE_RESPONSE" - -# Extract license name -LICENSE_NAME=$(echo "$LICENSE_RESPONSE" | grep -o '"name":"[^"]*"' | cut -d'"' -f4) -echo " License name: ${LICENSE_NAME:-None}" -echo - -# 3. List all releases with timestamps -echo "3. Listing all releases with timestamps..." 
-QUERY3='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { releases(last: 100, orderBy: {field: CREATED_AT, direction: DESC}) { edges { node { name publishedAt } } } } }" -}' -echo " Query: $QUERY3" -RELEASES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$QUERY3" https://api.github.com/graphql) -echo " Raw response: $RELEASES_RESPONSE" - -# Extract releases -echo " Releases:" -echo "$RELEASES_RESPONSE" | grep -E '"name":|"publishedAt":' | while read -r line1 && read -r line2; do - name=$(echo "$line1" | grep -o '"name":"[^"]*"' | cut -d'"' -f4) - date=$(echo "$line2" | grep -o '"publishedAt":"[^"]*"' | cut -d'"' -f4) - echo " - $name: $date" -done -echo - -# 4. List all contributors with their most recent contribution date -echo "4. Listing all contributors with their most recent contribution date..." -QUERY4='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... on Commit { history(first: 100) { nodes { author { user { login } } committedDate } } } } } } }" -}' -echo " Query: $QUERY4" -CONTRIBUTORS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$QUERY4" https://api.github.com/graphql) -echo " Raw response: $CONTRIBUTORS_RESPONSE" - -# Extract contributors and their latest commit dates -echo " Contributors and their most recent contribution:" -# This is a simplified approach - in practice, you'd want to process this more carefully -echo "$CONTRIBUTORS_RESPONSE" | grep -E '"login":|"committedDate":' | while read -r line1 && read -r line2; do - login=$(echo "$line1" | grep -o '"login":"[^"]*"' | cut -d'"' -f4) - date=$(echo "$line2" | grep -o '"committedDate":"[^"]*"' | cut -d'"' -f4) - if [ -n "$login" ] && [ -n "$date" ]; then - echo " - $login: $date" - fi -done -echo - -# 5. List all commits -echo "5. Listing all commits..." 
-QUERY5='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { defaultBranchRef { target { ... on Commit { history(first: 100) { nodes { messageHeadline committedDate author { name } } } } } } } }" -}' -echo " Query: $QUERY5" -COMMITS_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$QUERY5" https://api.github.com/graphql) -echo " Raw response: $COMMITS_RESPONSE" - -# Extract commits -echo " Commits:" -echo "$COMMITS_RESPONSE" | grep -E '"messageHeadline":|"committedDate":|"name":' | while read -r line1 && read -r line2 && read -r line3; do - message=$(echo "$line1" | grep -o '"messageHeadline":"[^"]*"' | cut -d'"' -f4) - date=$(echo "$line2" | grep -o '"committedDate":"[^"]*"' | cut -d'"' -f4) - author=$(echo "$line3" | grep -o '"name":"[^"]*"' | cut -d'"' -f4) - echo " - $date: $author - $message" -done -echo - -# 6. List all issues with creator and status -echo "6. Listing all issues with creator and status..." 
-QUERY6='{ - "query": "query { repository(owner: \"'"$OWNER"'\", name: \"'"$REPO_NAME"'\") { issues(first: 100, states: [OPEN, CLOSED]) { nodes { title state author { login } } } } }" -}' -echo " Query: $QUERY6" -ISSUES_RESPONSE=$(curl -s -H "Authorization: bearer $GITHUB_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$QUERY6" https://api.github.com/graphql) -echo " Raw response: $ISSUES_RESPONSE" - -# Extract issues -echo " Issues:" -echo "$ISSUES_RESPONSE" | grep -E '"title":|"state":|"login":' | while read -r line1 && read -r line2 && read -r line3; do - title=$(echo "$line1" | grep -o '"title":"[^"]*"' | cut -d'"' -f4) - state=$(echo "$line2" | grep -o '"state":"[^"]*"' | cut -d'"' -f4) - author=$(echo "$line3" | grep -o '"login":"[^"]*"' | cut -d'"' -f4) - echo " - $state: $author - $title" -done From dddd9ec1fa147972cbe5498986341406ceefacca Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 21:53:50 +0200 Subject: [PATCH 19/36] refactor: extract configuration management into separate class --- config.py | 33 +++++++++++++++++++++++++++++++++ github_client.py | 21 --------------------- metrics_check.py | 6 ++++-- 3 files changed, 37 insertions(+), 23 deletions(-) create mode 100644 config.py diff --git a/config.py b/config.py new file mode 100644 index 0000000..52c2e6f --- /dev/null +++ b/config.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +import yaml +import sys +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +class Configuration: + def __init__(self, config_file: str = 'config.yaml'): + self.owner: Optional[str] = None + self.repo_name: Optional[str] = None + self._load_config(config_file) + + def _load_config(self, config_file: str) -> None: + """Load configuration from YAML file""" + try: + with open(config_file, 'r') as f: + config = yaml.safe_load(f) + self.owner = config.get('owner') + self.repo_name = config.get('repo_name') + + if not self.owner or not self.repo_name: + 
logger.error("Please ensure config.yaml contains 'owner' and 'repo_name' keys.") + sys.exit(1) + + except FileNotFoundError: + logger.error(f"{config_file} file not found.") + sys.exit(1) + except Exception as e: + logger.error(f"Error parsing {config_file}: {e}") + sys.exit(1) diff --git a/github_client.py b/github_client.py index 7d518c5..8a6314f 100644 --- a/github_client.py +++ b/github_client.py @@ -14,30 +14,9 @@ def __init__(self): # Check if required environment variables are set self.github_token = os.environ.get('GITHUB_TOKEN') - # Load owner and repo_name from config.yaml - try: - import yaml - with open('config.yaml', 'r') as f: - config = yaml.safe_load(f) - self.owner = config.get('owner') - self.repo_name = config.get('repo_name') - except FileNotFoundError: - logger.error("config.yaml file not found.") - sys.exit(1) - except ImportError: - logger.error("PyYAML is not installed. Please install it with: pip install PyYAML") - sys.exit(1) - except Exception as e: - logger.error(f"Error parsing config.yaml: {e}") - sys.exit(1) - if not self.github_token: logger.error("Please set GITHUB_TOKEN environment variable.") sys.exit(1) - - if not self.owner or not self.repo_name: - logger.error("Please ensure config.yaml contains 'owner' and 'repo_name' keys.") - sys.exit(1) # Headers for GraphQL API self.headers = { diff --git a/metrics_check.py b/metrics_check.py index d94abb2..67b7454 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -6,6 +6,7 @@ from github_client import GitHubClient from metrics_processor import MetricsProcessor from file_writer import FileWriter +from config import Configuration # Set up logging logging.basicConfig( @@ -23,12 +24,13 @@ def main() -> None: try: # Initialize components + config = Configuration() github_client = GitHubClient() metrics_processor = MetricsProcessor(github_client) file_writer = FileWriter() - owner = github_client.owner - repo_name = github_client.repo_name + owner = config.owner + repo_name = 
config.repo_name logger.info(f"Processing repository: {owner}/{repo_name}") From 632fb7c46c9c3340abba55c6579035f84bd02eac Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 22:01:02 +0200 Subject: [PATCH 20/36] feat: make config file name configurable via parameter --- config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/config.py b/config.py index 52c2e6f..ddcba34 100644 --- a/config.py +++ b/config.py @@ -11,6 +11,7 @@ class Configuration: def __init__(self, config_file: str = 'config.yaml'): self.owner: Optional[str] = None self.repo_name: Optional[str] = None + self.config_file: str = config_file self._load_config(config_file) def _load_config(self, config_file: str) -> None: From 348e9a7fb68099de8a76bd4f0d47ca6200b2d10b Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 22:02:13 +0200 Subject: [PATCH 21/36] feat: make output file names and directory paths configurable --- config.py | 14 ++++++++++++++ config.yaml | 7 +++++++ file_writer.py | 39 +++++++++++++++++++++++++++------------ 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/config.py b/config.py index ddcba34..78afc01 100644 --- a/config.py +++ b/config.py @@ -12,6 +12,13 @@ def __init__(self, config_file: str = 'config.yaml'): self.owner: Optional[str] = None self.repo_name: Optional[str] = None self.config_file: str = config_file + self.output_dir: str = '.' 
+ self.root_md_files_output: str = 'root_md_files.jsonl' + self.license_output: str = 'license.jsonl' + self.releases_output: str = 'releases.jsonl' + self.contributors_output: str = 'contributors.jsonl' + self.commits_output: str = 'commits.jsonl' + self.issues_output: str = 'issues.jsonl' self._load_config(config_file) def _load_config(self, config_file: str) -> None: @@ -21,6 +28,13 @@ def _load_config(self, config_file: str) -> None: config = yaml.safe_load(f) self.owner = config.get('owner') self.repo_name = config.get('repo_name') + self.output_dir = config.get('output_dir', '.') + self.root_md_files_output = config.get('root_md_files_output', 'root_md_files.jsonl') + self.license_output = config.get('license_output', 'license.jsonl') + self.releases_output = config.get('releases_output', 'releases.jsonl') + self.contributors_output = config.get('contributors_output', 'contributors.jsonl') + self.commits_output = config.get('commits_output', 'commits.jsonl') + self.issues_output = config.get('issues_output', 'issues.jsonl') if not self.owner or not self.repo_name: logger.error("Please ensure config.yaml contains 'owner' and 'repo_name' keys.") diff --git a/config.yaml b/config.yaml index 6ededa6..2ee829a 100644 --- a/config.yaml +++ b/config.yaml @@ -1,2 +1,9 @@ owner: "duckdb" repo_name: "duckdb-wasm" +output_dir: "." 
+root_md_files_output: "root_md_files.jsonl" +license_output: "license.jsonl" +releases_output: "releases.jsonl" +contributors_output: "contributors.jsonl" +commits_output: "commits.jsonl" +issues_output: "issues.jsonl" diff --git a/file_writer.py b/file_writer.py index c805f1f..c2ece4f 100644 --- a/file_writer.py +++ b/file_writer.py @@ -2,48 +2,63 @@ import json import logging +import os from typing import Dict, List +from config import Configuration logger = logging.getLogger(__name__) class FileWriter: + def __init__(self, config: Configuration): + self.config = config + + def _get_output_path(self, filename: str) -> str: + """Get full path for output file""" + return os.path.join(self.config.output_dir, filename) + def write_root_md_files(self, md_files: List[str]) -> None: """Write the .md files in the root folder as JSONL""" - logger.info(f"Writing {len(md_files)} .md files to root_md_files.jsonl") - with open('root_md_files.jsonl', 'w') as f: + output_file = self._get_output_path(self.config.root_md_files_output) + logger.info(f"Writing {len(md_files)} .md files to {output_file}") + with open(output_file, 'w') as f: for file in md_files: f.write(json.dumps({"file": file}) + '\n') def write_license(self, license_name: str) -> None: """Write the repository license name as JSONL""" - logger.info(f"Writing license to license.jsonl: {license_name}") - with open('license.jsonl', 'w') as f: + output_file = self._get_output_path(self.config.license_output) + logger.info(f"Writing license to {output_file}: {license_name}") + with open(output_file, 'w') as f: f.write(json.dumps({"license": license_name}) + '\n') def write_releases(self, releases: List[Dict[str, str]]) -> None: """Write all releases with timestamps as JSONL""" - logger.info(f"Writing {len(releases)} releases to releases.jsonl") - with open('releases.jsonl', 'w') as f: + output_file = self._get_output_path(self.config.releases_output) + logger.info(f"Writing {len(releases)} releases to {output_file}") 
+ with open(output_file, 'w') as f: for release in releases: f.write(json.dumps(release) + '\n') def write_contributors(self, contributors: Dict[str, str]) -> None: """Write all contributors with their most recent contribution date as JSONL""" - logger.info(f"Writing {len(contributors)} contributors to contributors.jsonl") - with open('contributors.jsonl', 'w') as f: + output_file = self._get_output_path(self.config.contributors_output) + logger.info(f"Writing {len(contributors)} contributors to {output_file}") + with open(output_file, 'w') as f: for login, date in contributors.items(): f.write(json.dumps({"login": login, "last_contribution": date}) + '\n') def write_commits(self, commits: List[Dict[str, str]]) -> None: """Write all commits as JSONL""" - logger.info(f"Writing {len(commits)} commits to commits.jsonl") - with open('commits.jsonl', 'w') as f: + output_file = self._get_output_path(self.config.commits_output) + logger.info(f"Writing {len(commits)} commits to {output_file}") + with open(output_file, 'w') as f: for commit in commits: f.write(json.dumps(commit) + '\n') def write_issues(self, issues: List[Dict[str, str]]) -> None: """Write all issues with creator and status as JSONL""" - logger.info(f"Writing {len(issues)} issues to issues.jsonl") - with open('issues.jsonl', 'w') as f: + output_file = self._get_output_path(self.config.issues_output) + logger.info(f"Writing {len(issues)} issues to {output_file}") + with open(output_file, 'w') as f: for issue in issues: f.write(json.dumps(issue) + '\n') From 074ccd7d9fb2a86f4da3f932c7fb24f0e6721237 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 22:05:53 +0200 Subject: [PATCH 22/36] refactor: Make GitHub API configuration values configurable through Configuration class --- config.py | 13 ++++ config.yaml | 6 ++ github_client.py | 152 +++++++++++++++++++++++-------------------- metrics_processor.py | 42 ++++++------ 4 files changed, 121 insertions(+), 92 deletions(-) diff --git 
a/config.py b/config.py index 78afc01..afd5b19 100644 --- a/config.py +++ b/config.py @@ -19,6 +19,13 @@ def __init__(self, config_file: str = 'config.yaml'): self.contributors_output: str = 'contributors.jsonl' self.commits_output: str = 'commits.jsonl' self.issues_output: str = 'issues.jsonl' + + # GitHub API configuration + self.github_api_url: str = 'https://api.github.com/graphql' + self.pagination_limit: int = 100 + self.date_range_days: int = 365 + self.request_timeout: int = 30 + self._load_config(config_file) def _load_config(self, config_file: str) -> None: @@ -36,6 +43,12 @@ def _load_config(self, config_file: str) -> None: self.commits_output = config.get('commits_output', 'commits.jsonl') self.issues_output = config.get('issues_output', 'issues.jsonl') + # GitHub API configuration + self.github_api_url = config.get('github_api_url', 'https://api.github.com/graphql') + self.pagination_limit = config.get('pagination_limit', 100) + self.date_range_days = config.get('date_range_days', 365) + self.request_timeout = config.get('request_timeout', 30) + if not self.owner or not self.repo_name: logger.error("Please ensure config.yaml contains 'owner' and 'repo_name' keys.") sys.exit(1) diff --git a/config.yaml b/config.yaml index 2ee829a..582856d 100644 --- a/config.yaml +++ b/config.yaml @@ -7,3 +7,9 @@ releases_output: "releases.jsonl" contributors_output: "contributors.jsonl" commits_output: "commits.jsonl" issues_output: "issues.jsonl" + +# GitHub API configuration +github_api_url: "https://api.github.com/graphql" +pagination_limit: 100 +date_range_days: 365 +request_timeout: 30 diff --git a/github_client.py b/github_client.py index 8a6314f..b48bd50 100644 --- a/github_client.py +++ b/github_client.py @@ -6,11 +6,14 @@ import logging from datetime import datetime, timedelta from typing import Dict, Optional, Any +from config import Configuration logger = logging.getLogger(__name__) class GitHubClient: - def __init__(self): + def __init__(self, config: 
Configuration): + self.config = config + # Check if required environment variables are set self.github_token = os.environ.get('GITHUB_TOKEN') @@ -25,12 +28,12 @@ def __init__(self): } # GraphQL endpoint - self.url = 'https://api.github.com/graphql' + self.url = self.config.github_api_url - # Calculate the date for one year ago - self.one_year_ago = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%dT%H:%M:%SZ') + # Calculate the date for specified days ago + self.date_range_ago = (datetime.now() - timedelta(days=self.config.date_range_days)).strftime('%Y-%m-%dT%H:%M:%SZ') - # GraphQL Queries + # GraphQL Queries with configurable pagination limit self.ROOT_FILES_QUERY = ''' { repository(owner: "%s", name: "%s") { @@ -55,105 +58,110 @@ def __init__(self): } ''' - self.RELEASES_QUERY = ''' - query($cursor: String) { - repository(owner: "%s", name: "%s") { - releases(first: 100, orderBy: {field: CREATED_AT, direction: DESC}, after: $cursor) { - edges { - node { + self.RELEASES_QUERY = f''' + query($cursor: String) {{ + repository(owner: "%s", name: "%s") {{ + releases(first: {self.config.pagination_limit}, orderBy: {{field: CREATED_AT, direction: DESC}}, after: $cursor) {{ + edges {{ + node {{ name publishedAt - } + }} cursor - } - pageInfo { + }} + pageInfo {{ hasNextPage endCursor - } - } - } - } + }} + }} + }} + }} ''' - self.CONTRIBUTORS_QUERY = ''' - query($cursor: String, $since: GitTimestamp!) { - repository(owner: "%s", name: "%s") { - defaultBranchRef { - target { - ... on Commit { - history(first: 100, since: $since, after: $cursor) { - nodes { - author { - user { + self.CONTRIBUTORS_QUERY = f''' + query($cursor: String, $since: GitTimestamp!) {{ + repository(owner: "%s", name: "%s") {{ + defaultBranchRef {{ + target {{ + ... 
on Commit {{ + history(first: {self.config.pagination_limit}, since: $since, after: $cursor) {{ + nodes {{ + author {{ + user {{ login - } - } + }} + }} committedDate - } - pageInfo { + }} + pageInfo {{ hasNextPage endCursor - } - } - } - } - } - } - } + }} + }} + }} + }} + }} + }} + }} ''' - self.COMMITS_QUERY = ''' - query($cursor: String, $since: GitTimestamp!) { - repository(owner: "%s", name: "%s") { - defaultBranchRef { - target { - ... on Commit { - history(first: 100, since: $since, after: $cursor) { - nodes { + self.COMMITS_QUERY = f''' + query($cursor: String, $since: GitTimestamp!) {{ + repository(owner: "%s", name: "%s") {{ + defaultBranchRef {{ + target {{ + ... on Commit {{ + history(first: {self.config.pagination_limit}, since: $since, after: $cursor) {{ + nodes {{ messageHeadline committedDate - author { + author {{ name - } - } - pageInfo { + }} + }} + pageInfo {{ hasNextPage endCursor - } - } - } - } - } - } - } + }} + }} + }} + }} + }} + }} + }} ''' - self.ISSUES_QUERY = ''' - query($cursor: String) { - repository(owner: "%s", name: "%s") { - issues(first: 100, states: [OPEN, CLOSED], after: $cursor, orderBy: {field: CREATED_AT, direction: DESC}) { - nodes { + self.ISSUES_QUERY = f''' + query($cursor: String) {{ + repository(owner: "%s", name: "%s") {{ + issues(first: {self.config.pagination_limit}, states: [OPEN, CLOSED], after: $cursor, orderBy: {{field: CREATED_AT, direction: DESC}}) {{ + nodes {{ title state - author { + author {{ login - } + }} createdAt - } - pageInfo { + }} + pageInfo {{ hasNextPage endCursor - } - } - } - } + }} + }} + }} + }} ''' def run_query(self, query: str, variables: Dict = {}) -> Optional[Dict[Any, Any]]: """Run a GraphQL query with variables and return the response""" try: logger.debug(f"Running query with variables: {variables}") - response = requests.post(self.url, headers=self.headers, json={'query': query, 'variables': variables}) + response = requests.post( + self.url, + headers=self.headers, + json={'query': 
query, 'variables': variables}, + timeout=self.config.request_timeout + ) if response.status_code == 200: logger.debug("Query successful") return response.json() diff --git a/metrics_processor.py b/metrics_processor.py index 474ccd3..b3549d2 100644 --- a/metrics_processor.py +++ b/metrics_processor.py @@ -3,12 +3,14 @@ import logging from typing import Dict, List, Tuple, Optional, Any from github_client import GitHubClient +from config import Configuration logger = logging.getLogger(__name__) class MetricsProcessor: - def __init__(self, github_client: GitHubClient): + def __init__(self, github_client: GitHubClient, config: Configuration): self.github_client = github_client + self.config = config def get_root_md_files(self, owner: str, repo_name: str) -> List[str]: """Get all .md files in the root folder""" @@ -54,24 +56,24 @@ def _extract_releases(self, result: Dict) -> Tuple[List[Dict[str, str]], Optiona # Filter releases from the past year for edge in release_edges: published_at = edge['node']['publishedAt'] - if published_at and published_at >= self.github_client.one_year_ago: + if published_at and published_at >= self.github_client.date_range_ago: releases.append({ 'name': edge['node']['name'] or 'Unnamed release', 'publishedAt': published_at }) - elif published_at and published_at < self.github_client.one_year_ago: - # Stop pagination if we've gone past the one-year boundary - logger.debug("Reached releases older than one year, stopping pagination") + elif published_at and published_at < self.github_client.date_range_ago: + # Stop pagination if we've gone past the date range boundary + logger.debug(f"Reached releases older than {self.config.date_range_days} days, stopping pagination") return releases, None return releases, page_info def get_releases(self, owner: str, repo_name: str) -> List[Dict[str, str]]: - """Get all releases with timestamps from the past year""" + """Get all releases with timestamps from the past date range""" logger.info("Checking 
releases...") query = self.github_client.RELEASES_QUERY % (owner, repo_name) releases = self._paginate_github_query(query, self._extract_releases) - logger.info(f"Found {len(releases)} releases in the past year") + logger.info(f"Found {len(releases)} releases in the past {self.config.date_range_days} days") return releases def _extract_contributors(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: @@ -99,10 +101,10 @@ def _extract_contributors(self, result: Dict) -> Tuple[List[Dict[str, str]], Opt return [contributors], page_info def get_contributors(self, owner: str, repo_name: str) -> Dict[str, str]: - """Get all contributors with their most recent contribution date from the past year""" + """Get all contributors with their most recent contribution date from the past date range""" logger.info("Checking contributors...") query = self.github_client.CONTRIBUTORS_QUERY % (owner, repo_name) - contributor_list = self._paginate_github_query(query, self._extract_contributors, {'since': self.github_client.one_year_ago}) + contributor_list = self._paginate_github_query(query, self._extract_contributors, {'since': self.github_client.date_range_ago}) # Merge all contributor dictionaries final_contributors: Dict[str, str] = {} @@ -111,7 +113,7 @@ def get_contributors(self, owner: str, repo_name: str) -> Dict[str, str]: if login not in final_contributors or date > final_contributors[login]: final_contributors[login] = date - logger.info(f"Found {len(final_contributors)} contributors in the past year") + logger.info(f"Found {len(final_contributors)} contributors in the past {self.config.date_range_days} days") return final_contributors def _extract_commits(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: @@ -138,11 +140,11 @@ def _extract_commits(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional return commits, page_info def get_commits(self, owner: str, repo_name: str) -> List[Dict[str, str]]: - """Get all commits from the past 
year""" + """Get all commits from the past date range""" logger.info("Checking commits...") query = self.github_client.COMMITS_QUERY % (owner, repo_name) - commits = self._paginate_github_query(query, self._extract_commits, {'since': self.github_client.one_year_ago}) - logger.info(f"Found {len(commits)} commits in the past year") + commits = self._paginate_github_query(query, self._extract_commits, {'since': self.github_client.date_range_ago}) + logger.info(f"Found {len(commits)} commits in the past {self.config.date_range_days} days") return commits def _extract_issues(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[Dict]]: @@ -156,29 +158,29 @@ def _extract_issues(self, result: Dict) -> Tuple[List[Dict[str, str]], Optional[ issue_nodes = repo_data['issues']['nodes'] page_info = repo_data['issues']['pageInfo'] - # Filter issues from the past year + # Filter issues from the past date range for issue in issue_nodes: created_at = issue.get('createdAt') - if created_at and created_at >= self.github_client.one_year_ago: + if created_at and created_at >= self.github_client.date_range_ago: issues.append({ 'title': issue.get('title', ''), 'state': issue.get('state', ''), 'author': issue.get('author', {}).get('login', 'Unknown') if issue.get('author') else 'Unknown', 'createdAt': created_at }) - elif created_at and created_at < self.github_client.one_year_ago: - # Stop pagination if we've gone past the one-year boundary - logger.debug("Reached issues older than one year, stopping pagination") + elif created_at and created_at < self.github_client.date_range_ago: + # Stop pagination if we've gone past the date range boundary + logger.debug(f"Reached issues older than {self.config.date_range_days} days, stopping pagination") return issues, None return issues, page_info def get_issues(self, owner: str, repo_name: str) -> List[Dict[str, str]]: - """Get all issues with creator and status from the past year""" + """Get all issues with creator and status from the past 
date range""" logger.info("Checking issues...") query = self.github_client.ISSUES_QUERY % (owner, repo_name) issues = self._paginate_github_query(query, self._extract_issues) - logger.info(f"Found {len(issues)} issues in the past year") + logger.info(f"Found {len(issues)} issues in the past {self.config.date_range_days} days") return issues def _paginate_github_query( From c8f01871298f22315cf24e173bf37808378c8f48 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 22:08:33 +0200 Subject: [PATCH 23/36] fix: Pass required configuration object to GitHubClient constructor --- metrics_check.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/metrics_check.py b/metrics_check.py index 67b7454..59102e2 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -25,9 +25,9 @@ def main() -> None: try: # Initialize components config = Configuration() - github_client = GitHubClient() - metrics_processor = MetricsProcessor(github_client) - file_writer = FileWriter() + github_client = GitHubClient(config) + metrics_processor = MetricsProcessor(github_client, config) + file_writer = FileWriter(config) owner = config.owner repo_name = config.repo_name From 3287b39101f60219b70fcfd85b62a404e96fa89c Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 22:16:12 +0200 Subject: [PATCH 24/36] refactor: restructure config.yaml to use nested repo and output sections --- config.py | 28 +++++++++++++++++----------- config.yaml | 22 ++++++++++++---------- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/config.py b/config.py index afd5b19..ebd4499 100644 --- a/config.py +++ b/config.py @@ -33,24 +33,30 @@ def _load_config(self, config_file: str) -> None: try: with open(config_file, 'r') as f: config = yaml.safe_load(f) - self.owner = config.get('owner') - self.repo_name = config.get('repo_name') - self.output_dir = config.get('output_dir', '.') - self.root_md_files_output = config.get('root_md_files_output', 
'root_md_files.jsonl') - self.license_output = config.get('license_output', 'license.jsonl') - self.releases_output = config.get('releases_output', 'releases.jsonl') - self.contributors_output = config.get('contributors_output', 'contributors.jsonl') - self.commits_output = config.get('commits_output', 'commits.jsonl') - self.issues_output = config.get('issues_output', 'issues.jsonl') - # GitHub API configuration + # Load repository configuration + repo_config = config.get('repo', {}) + self.owner = repo_config.get('owner') + self.repo_name = repo_config.get('name') + + # Load output configuration + output_config = config.get('output', {}) + self.output_dir = output_config.get('directory', '.') + self.root_md_files_output = output_config.get('root_md_files', 'root_md_files.jsonl') + self.license_output = output_config.get('license', 'license.jsonl') + self.releases_output = output_config.get('releases', 'releases.jsonl') + self.contributors_output = output_config.get('contributors', 'contributors.jsonl') + self.commits_output = output_config.get('commits', 'commits.jsonl') + self.issues_output = output_config.get('issues', 'issues.jsonl') + + # Load GitHub API configuration self.github_api_url = config.get('github_api_url', 'https://api.github.com/graphql') self.pagination_limit = config.get('pagination_limit', 100) self.date_range_days = config.get('date_range_days', 365) self.request_timeout = config.get('request_timeout', 30) if not self.owner or not self.repo_name: - logger.error("Please ensure config.yaml contains 'owner' and 'repo_name' keys.") + logger.error("Please ensure config.yaml contains 'repo.owner' and 'repo.name' keys.") sys.exit(1) except FileNotFoundError: diff --git a/config.yaml b/config.yaml index 582856d..670e3c8 100644 --- a/config.yaml +++ b/config.yaml @@ -1,14 +1,16 @@ -owner: "duckdb" -repo_name: "duckdb-wasm" -output_dir: "." 
-root_md_files_output: "root_md_files.jsonl" -license_output: "license.jsonl" -releases_output: "releases.jsonl" -contributors_output: "contributors.jsonl" -commits_output: "commits.jsonl" -issues_output: "issues.jsonl" +repo: + owner: "duckdb" + name: "duckdb-wasm" + +output: + directory: "." + root_md_files: "root_md_files.jsonl" + license: "license.jsonl" + releases: "releases.jsonl" + contributors: "contributors.jsonl" + commits: "commits.jsonl" + issues: "issues.jsonl" -# GitHub API configuration github_api_url: "https://api.github.com/graphql" pagination_limit: 100 date_range_days: 365 From 99760c87436bc1407cb288a29f1fd4fe121708c0 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 22:20:13 +0200 Subject: [PATCH 25/36] feat: Add support for repo owner/name and config file from environment variables with precedence over config file --- config.py | 18 +++++++++++------- metrics_check.py | 11 ++++++++++- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/config.py b/config.py index ebd4499..a3380b4 100644 --- a/config.py +++ b/config.py @@ -3,15 +3,19 @@ import yaml import sys import logging +import os from typing import Optional logger = logging.getLogger(__name__) class Configuration: - def __init__(self, config_file: str = 'config.yaml'): + def __init__(self, config_file: Optional[str] = None): + # Get config file path from environment variable or use default + self.config_file: str = config_file or os.environ.get('CONFIG_FILE', 'config.yaml') + + # Initialize default values self.owner: Optional[str] = None self.repo_name: Optional[str] = None - self.config_file: str = config_file self.output_dir: str = '.' 
self.root_md_files_output: str = 'root_md_files.jsonl' self.license_output: str = 'license.jsonl' @@ -26,7 +30,11 @@ def __init__(self, config_file: str = 'config.yaml'): self.date_range_days: int = 365 self.request_timeout: int = 30 - self._load_config(config_file) + self._load_config(self.config_file) + + # Override with environment variables if set + self.owner = os.environ.get('REPO_OWNER') or self.owner + self.repo_name = os.environ.get('REPO_NAME') or self.repo_name def _load_config(self, config_file: str) -> None: """Load configuration from YAML file""" @@ -55,10 +63,6 @@ def _load_config(self, config_file: str) -> None: self.date_range_days = config.get('date_range_days', 365) self.request_timeout = config.get('request_timeout', 30) - if not self.owner or not self.repo_name: - logger.error("Please ensure config.yaml contains 'repo.owner' and 'repo.name' keys.") - sys.exit(1) - except FileNotFoundError: logger.error(f"{config_file} file not found.") sys.exit(1) diff --git a/metrics_check.py b/metrics_check.py index 59102e2..a43a891 100644 --- a/metrics_check.py +++ b/metrics_check.py @@ -2,6 +2,7 @@ import sys import logging +import os from datetime import datetime from github_client import GitHubClient from metrics_processor import MetricsProcessor @@ -24,7 +25,15 @@ def main() -> None: try: # Initialize components - config = Configuration() + # Get config file from environment variable or use None to let Configuration class handle it + config_file = os.environ.get('CONFIG_FILE') + config = Configuration(config_file) + + # Check if owner and repo_name are set + if not config.owner or not config.repo_name: + logger.error("Please ensure repo.owner and repo.name are set either in config file or via environment variables (REPO_OWNER, REPO_NAME).") + sys.exit(1) + github_client = GitHubClient(config) metrics_processor = MetricsProcessor(github_client, config) file_writer = FileWriter(config) From e13fc206a3bd1c3d255a38454325a70570b96eea Mon Sep 17 00:00:00 2001 
From: snowpoke Date: Mon, 22 Sep 2025 22:23:40 +0200 Subject: [PATCH 26/36] feat: put output in folder by default --- .gitignore | 1 + config.yaml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 1342bb7..d24fbff 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__ *.jsonl node_modules venv +output diff --git a/config.yaml b/config.yaml index 670e3c8..bd67ba3 100644 --- a/config.yaml +++ b/config.yaml @@ -3,7 +3,7 @@ repo: name: "duckdb-wasm" output: - directory: "." + directory: "output" root_md_files: "root_md_files.jsonl" license: "license.jsonl" releases: "releases.jsonl" From d5819a1e8beed8913d8aa59e2a0d76b9134fb77b Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 22:33:38 +0200 Subject: [PATCH 27/36] feat: refactor folder structure --- config.yaml => config/config.yaml | 0 config.py => src/config.py | 2 +- file_writer.py => src/file_writer.py | 0 github_client.py => src/github_client.py | 0 metrics_check.py => src/metrics_check.py | 0 metrics_processor.py => src/metrics_processor.py | 0 6 files changed, 1 insertion(+), 1 deletion(-) rename config.yaml => config/config.yaml (100%) rename config.py => src/config.py (98%) rename file_writer.py => src/file_writer.py (100%) rename github_client.py => src/github_client.py (100%) rename metrics_check.py => src/metrics_check.py (100%) rename metrics_processor.py => src/metrics_processor.py (100%) diff --git a/config.yaml b/config/config.yaml similarity index 100% rename from config.yaml rename to config/config.yaml diff --git a/config.py b/src/config.py similarity index 98% rename from config.py rename to src/config.py index a3380b4..35ed0a2 100644 --- a/config.py +++ b/src/config.py @@ -11,7 +11,7 @@ class Configuration: def __init__(self, config_file: Optional[str] = None): # Get config file path from environment variable or use default - self.config_file: str = config_file or os.environ.get('CONFIG_FILE', 'config.yaml') + 
self.config_file: str = config_file or os.environ.get('CONFIG_FILE', 'config/config.yaml') # Initialize default values self.owner: Optional[str] = None diff --git a/file_writer.py b/src/file_writer.py similarity index 100% rename from file_writer.py rename to src/file_writer.py diff --git a/github_client.py b/src/github_client.py similarity index 100% rename from github_client.py rename to src/github_client.py diff --git a/metrics_check.py b/src/metrics_check.py similarity index 100% rename from metrics_check.py rename to src/metrics_check.py diff --git a/metrics_processor.py b/src/metrics_processor.py similarity index 100% rename from metrics_processor.py rename to src/metrics_processor.py From b7171c4a2276f2302f1a5043d62a3d1cef0641cb Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 23:02:22 +0200 Subject: [PATCH 28/36] docs: add comprehensive README with installation, configuration, and usage instructions --- README2.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 README2.md diff --git a/README2.md b/README2.md new file mode 100644 index 0000000..3789989 --- /dev/null +++ b/README2.md @@ -0,0 +1,39 @@ +# GitHub Repository Metrics Collector + +This project collects various metrics from GitHub repositories using the GitHub GraphQL API and outputs them as JSONL files. It's designed to help analyze repository activity, contributors, issues, and other key metrics. + +## Features + +- Collects root markdown files +- Retrieves repository license information +- Gathers release information with timestamps +- Tracks contributors and their most recent contribution dates +- Collects commit history +- Records issue information including creators and status + +## Prerequisites + +- Python 3.7+ +- A GitHub personal access token with appropriate permissions + +## Installation + +1. Clone the repository: + ```bash + git clone + cd + ``` + +2. 
Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Configuration + +The program can be configured using both a YAML configuration file and environment variables. Environment variables take precedence over the configuration file. + +### Configuration File (config/config.yaml) + +The default configuration file is located at `config/config.yaml`: + From c156f95881ace6dd4d2ff3b22bed3e23a068c726 Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 23:05:18 +0200 Subject: [PATCH 29/36] docs: update README to clarify markdown file collection and Python version requirement --- README2.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README2.md b/README2.md index 3789989..1fadb8c 100644 --- a/README2.md +++ b/README2.md @@ -4,17 +4,17 @@ This project collects various metrics from GitHub repositories using the GitHub ## Features -- Collects root markdown files +- Collects information on markdown files at repository root (e.g. 
README.md, LICENSE.md) - Retrieves repository license information -- Gathers release information with timestamps +- Gathers list of releases, with timestamps - Tracks contributors and their most recent contribution dates - Collects commit history - Records issue information including creators and status ## Prerequisites -- Python 3.7+ -- A GitHub personal access token with appropriate permissions +- Python 3 +- A GitHub personal access token with appropriate repository read permissions ## Installation From 9823f10ab691847657aea86205874712b07bb556 Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 23:10:03 +0200 Subject: [PATCH 30/36] feat: convert into single README file --- README.md | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++ README2.md | 39 ------------- 2 files changed, 159 insertions(+), 39 deletions(-) delete mode 100644 README2.md diff --git a/README.md b/README.md index 7bf9b2f..caee5e1 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,162 @@ +# GitHub Repository Metrics Collector + +This project collects various metrics from GitHub repositories using the GitHub GraphQL API and outputs them as JSONL files. It's designed to help analyze repository activity, contributors, issues, and other key metrics. + +## Features + +- Collects information on markdown files at repository root (e.g. README.md, LICENSE.md) +- Retrieves repository license information +- Gathers list of releases, with timestamps +- Tracks contributors and their most recent contribution dates +- Collects commit history +- Records issue information including creators and status + +## Prerequisites + +- Python 3 +- A GitHub personal access token with appropriate repository read permissions + +## Installation + +1. Clone the repository: + ```bash + git clone <repository-url> + cd <repository-directory> + ``` + +2. 
Install the required dependencies: + ```bash + pip install -r requirements.txt + ``` + +## Configuration + +The program can be configured using both a YAML configuration file and environment variables. Environment variables take precedence over the configuration file. + +### Configuration File (config/config.yaml) + +The default configuration file is located at `config/config.yaml`: + +```yaml +repo: + owner: "duckdb" # Repository owner/organization + name: "duckdb-wasm" # Repository name + +output: + directory: "output" # Output directory for JSONL files + root_md_files: "root_md_files.jsonl" + license: "license.jsonl" + releases: "releases.jsonl" + contributors: "contributors.jsonl" + commits: "commits.jsonl" + issues: "issues.jsonl" + +github_api_url: "https://api.github.com/graphql" +pagination_limit: 100 # Number of items per API request +date_range_days: 365 # How far back to collect data (in days) +request_timeout: 30 # API request timeout in seconds +``` + +### Environment Variables + +You can override configuration values using environment variables: + +- `GITHUB_TOKEN`: (Required) Your GitHub personal access token +- `CONFIG_FILE`: Path to custom configuration file (optional) +- `REPO_OWNER`: Override repository owner from config +- `REPO_NAME`: Override repository name from config + +To create a GitHub personal access token: + +1. Go to GitHub Settings > Developer settings > Personal access tokens +2. Generate a new token with repo scope +3. Copy the token for use with this application + +## Running the Program + +1. Set your GitHub token as an environment variable: + ```bash + export GITHUB_TOKEN=your_github_token_here + ``` + +2. Run the metrics collection script: + ```bash + python src/metrics_check.py + ``` + +3. To use a custom configuration file: + ```bash + export CONFIG_FILE=path/to/your/config.yaml + python src/metrics_check.py + ``` + +4. 
To override repository owner/name: + ```bash + export REPO_OWNER=your_owner + export REPO_NAME=your_repo + python src/metrics_check.py + ``` + +## Output Files + +All output files are saved in JSONL format (JSON Lines), with one JSON object per line. By default, files are saved to the `output/` directory. + +### root_md_files.jsonl + +Contains names of all markdown files in the repository root: + +```json +{"file": "README.md"} +{"file": "CONTRIBUTING.md"} +``` + +### license.jsonl + +Contains the repository license information: + +```json +{"license": "MIT License"} +``` + +### releases.jsonl + +Contains release information with timestamps: + +```json +{"name": "v1.0.0", "publishedAt": "2023-01-15T10:30:00Z"} +``` + +### contributors.jsonl + +Contains contributors with their most recent contribution date: + +```json +{"login": "username", "last_contribution": "2023-05-20T14:22:30Z"} +``` + +### commits.jsonl + +Contains commit information: + +```json +{"message": "Fix bug in parser", "date": "2023-05-19T09:15:00Z", "author": "Developer Name"} +``` + +### issues.jsonl + +Contains issue information: + +```json +{"title": "Bug in authentication", "state": "CLOSED", "author": "user123", "createdAt": "2023-04-10T16:45:00Z"} +``` + +## Customization + +You can modify the date range for data collection by changing the `date_range_days` value in the configuration. The default is 365 days (1 year). + +The pagination limit can also be adjusted with `pagination_limit` to control how many items are fetched per API request. +--- + # 🚀 Health Analyzer PoC > Reducing Risk in Open Source Adoption diff --git a/README2.md b/README2.md deleted file mode 100644 index 1fadb8c..0000000 --- a/README2.md +++ /dev/null @@ -1,39 +0,0 @@ -# GitHub Repository Metrics Collector - -This project collects various metrics from GitHub repositories using the GitHub GraphQL API and outputs them as JSONL files. 
It's designed to help analyze repository activity, contributors, issues, and other key metrics. - -## Features - -- Collects information on markdown files at repository root (e.g. README.md, LICENSE.md) -- Retrieves repository license information -- Gathers list of releases, with timestamps -- Tracks contributors and their most recent contribution dates -- Collects commit history -- Records issue information including creators and status - -## Prerequisites - -- Python 3 -- A GitHub personal access token with appropriate repository read permissions - -## Installation - -1. Clone the repository: - ```bash - git clone <repository-url> - cd <repository-directory> - ``` - -2. Install the required dependencies: - ```bash - pip install -r requirements.txt - ``` - -## Configuration - -The program can be configured using both a YAML configuration file and environment variables. Environment variables take precedence over the configuration file. - -### Configuration File (config/config.yaml) - -The default configuration file is located at `config/config.yaml`: - From 565e5eb7cea70bfe3e0637f4ff8ef873f4ce302a Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 23:11:49 +0200 Subject: [PATCH 31/36] fix: move GITHUB_TOKEN access from GitHubClient to Configuration class --- src/config.py | 3 +++ src/github_client.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/config.py b/src/config.py index 35ed0a2..dd79ac2 100644 --- a/src/config.py +++ b/src/config.py @@ -26,6 +26,7 @@ def __init__(self, config_file: Optional[str] = None): # GitHub API configuration self.github_api_url: str = 'https://api.github.com/graphql' + self.github_token: Optional[str] = None self.pagination_limit: int = 100 self.date_range_days: int = 365 self.request_timeout: int = 30 @@ -35,6 +36,7 @@ def __init__(self, config_file: Optional[str] = None): # Override with environment variables if set self.owner = os.environ.get('REPO_OWNER') or self.owner self.repo_name = os.environ.get('REPO_NAME') or 
self.repo_name + self.github_token = os.environ.get('GITHUB_TOKEN') or self.github_token def _load_config(self, config_file: str) -> None: """Load configuration from YAML file""" @@ -59,6 +61,7 @@ def _load_config(self, config_file: str) -> None: # Load GitHub API configuration self.github_api_url = config.get('github_api_url', 'https://api.github.com/graphql') + self.github_token = config.get('github_token') # Allow token in config file self.pagination_limit = config.get('pagination_limit', 100) self.date_range_days = config.get('date_range_days', 365) self.request_timeout = config.get('request_timeout', 30) diff --git a/src/github_client.py b/src/github_client.py index b48bd50..e214f6e 100644 --- a/src/github_client.py +++ b/src/github_client.py @@ -14,8 +14,8 @@ class GitHubClient: def __init__(self, config: Configuration): self.config = config - # Check if required environment variables are set - self.github_token = os.environ.get('GITHUB_TOKEN') + # Check if GitHub token is set in config (from env var or config file) + self.github_token = self.config.github_token if not self.github_token: logger.error("Please set GITHUB_TOKEN environment variable.") From a322fc6791407ccd14a71217141edf502637d727 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 23:14:39 +0200 Subject: [PATCH 32/36] feat: add support for GITHUB_TOKEN_FILE environment variable to read token from file --- README.md | 12 ++++++++++++ src/config.py | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index caee5e1..d6f273f 100644 --- a/README.md +++ b/README.md @@ -62,22 +62,34 @@ request_timeout: 30 # API request timeout in seconds You can override configuration values using environment variables: - `GITHUB_TOKEN`: (Required) Your GitHub personal access token +- `GITHUB_TOKEN_FILE`: (Alternative to `GITHUB_TOKEN`) Path to a file containing your GitHub personal access token - `CONFIG_FILE`: Path to 
custom configuration file (optional) - `REPO_OWNER`: Override repository owner from config - `REPO_NAME`: Override repository name from config +Note: You must provide either `GITHUB_TOKEN` or `GITHUB_TOKEN_FILE`, but not both. + To create a GitHub personal access token: 1. Go to GitHub Settings > Developer settings > Personal access tokens 2. Generate a new token with repo scope 3. Copy the token for use with this application +To use a token file: +1. Create a file containing only your GitHub token (no other characters; leading/trailing whitespace such as a final newline is stripped when the file is read) +2. Set the `GITHUB_TOKEN_FILE` environment variable to the path of this file + ## Running the Program 1. Set your GitHub token as an environment variable: ```bash export GITHUB_TOKEN=your_github_token_here ``` + + OR set the path to a token file: + ```bash + export GITHUB_TOKEN_FILE=/path/to/your/token/file + ``` 2. Run the metrics collection script: ```bash diff --git a/src/config.py b/src/config.py index dd79ac2..b0d045c 100644 --- a/src/config.py +++ b/src/config.py @@ -36,7 +36,42 @@ def __init__(self, config_file: Optional[str] = None): # Override with environment variables if set self.owner = os.environ.get('REPO_OWNER') or self.owner self.repo_name = os.environ.get('REPO_NAME') or self.repo_name - self.github_token = os.environ.get('GITHUB_TOKEN') or self.github_token + self.github_token = self._get_github_token() + + def _get_github_token(self) -> Optional[str]: + """Get GitHub token from environment variables or config file""" + github_token = self.github_token # From config file + + # Check for GITHUB_TOKEN environment variable + env_token = os.environ.get('GITHUB_TOKEN') + + # Check for GITHUB_TOKEN_FILE environment variable + token_file_path = os.environ.get('GITHUB_TOKEN_FILE') + + # Validate that only one token source is used + if env_token and token_file_path: + logger.error("Both GITHUB_TOKEN and GITHUB_TOKEN_FILE environment variables are set. 
Please use only one.") + sys.exit(1) + + # Priority: GITHUB_TOKEN env var > GITHUB_TOKEN_FILE env var > config file + if env_token: + return env_token + elif token_file_path: + try: + with open(token_file_path, 'r') as f: + token = f.read().strip() + if not token: + logger.error(f"Token file {token_file_path} is empty.") + sys.exit(1) + return token + except FileNotFoundError: + logger.error(f"Token file {token_file_path} not found.") + sys.exit(1) + except Exception as e: + logger.error(f"Error reading token file {token_file_path}: {e}") + sys.exit(1) + + return github_token def _load_config(self, config_file: str) -> None: """Load configuration from YAML file""" From a963479d262b9277c0b7e54f50b4a9b51a58826c Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 23:17:26 +0200 Subject: [PATCH 33/36] feat: ignore .token file --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index d24fbff..a02fbf0 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ __pycache__ node_modules venv output +.token From 11095fc10c38f17290b4c669ef2dacab1b0dff4e Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 23:18:14 +0200 Subject: [PATCH 34/36] feat: add Dockerfile for project containerization --- Dockerfile | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..362c274 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,34 @@ +# Use Python 3.9 slim image as base +FROM python:3.9-slim + +# Set working directory +WORKDIR /app + +# Copy requirements file +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy source code +COPY src/ ./src/ + +# Copy config directory +COPY config/ ./config/ + +# Copy the main script +COPY src/metrics_check.py . 
+ +# Create output directory +RUN mkdir -p output + +# Set environment variables with defaults +ENV CONFIG_FILE=config/config.yaml +ENV OUTPUT_DIR=output + +# Create a non-root user for security +RUN useradd --create-home --shell /bin/bash appuser +USER appuser + +# Set entrypoint +ENTRYPOINT ["python", "metrics_check.py"] From 5807a67ce244867937bdf69cf1e99b325dbc7386 Mon Sep 17 00:00:00 2001 From: snowpoke Date: Mon, 22 Sep 2025 23:35:28 +0200 Subject: [PATCH 35/36] feat: security --- Dockerfile | 33 +++++++++------------------------ requirements.txt | 4 ++-- 2 files changed, 11 insertions(+), 26 deletions(-) diff --git a/Dockerfile b/Dockerfile index 362c274..1d7625b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,34 +1,19 @@ -# Use Python 3.9 slim image as base -FROM python:3.9-slim +FROM python:3.13-slim -# Set working directory WORKDIR /app -# Copy requirements file COPY requirements.txt . -# Install Python dependencies +# Run as non-root user +RUN useradd --create-home --shell /bin/bash appuser \ + && mkdir -p output \ + && chown -R appuser:appuser output + +USER appuser + RUN pip install --no-cache-dir -r requirements.txt -# Copy source code COPY src/ ./src/ - -# Copy config directory COPY config/ ./config/ -# Copy the main script -COPY src/metrics_check.py . 
- -# Create output directory -RUN mkdir -p output - -# Set environment variables with defaults -ENV CONFIG_FILE=config/config.yaml -ENV OUTPUT_DIR=output - -# Create a non-root user for security -RUN useradd --create-home --shell /bin/bash appuser -USER appuser - -# Set entrypoint -ENTRYPOINT ["python", "metrics_check.py"] +ENTRYPOINT ["python", "./src/metrics_check.py"] diff --git a/requirements.txt b/requirements.txt index 33f05f7..0808481 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -requests -PyYAML +requests==2.32.5 +PyYAML==6.0.2 From 625f86ae2829f48062d037a93f9a23bb4e7d4865 Mon Sep 17 00:00:00 2001 From: "snowpoke (aider)" Date: Mon, 22 Sep 2025 23:43:04 +0200 Subject: [PATCH 36/36] docs: add Docker container usage instructions to README.md --- README.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/README.md b/README.md index d6f273f..02e56b0 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,8 @@ To use a token file: ## Running the Program +### Running with Python + 1. Set your GitHub token as an environment variable: ```bash export GITHUB_TOKEN=your_github_token_here @@ -109,6 +111,51 @@ To use a token file: python src/metrics_check.py ``` +### Running with Docker + +You can run the application using the pre-built Docker image: + +1. Pull the image: + ```bash + docker pull codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +2. Run the container with your GitHub token: + ```bash + docker run --rm \ + -e GITHUB_TOKEN=your_github_token_here \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +3. To use a token file: + ```bash + docker run --rm \ + -e GITHUB_TOKEN_FILE=/app/token.txt \ + -v /path/to/your/token/file:/app/token.txt \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +4. 
To use a custom configuration file: + ```bash + docker run --rm \ + -e GITHUB_TOKEN=your_github_token_here \ + -v /path/to/your/config.yaml:/app/config/config.yaml \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + +5. To override repository owner/name: + ```bash + docker run --rm \ + -e GITHUB_TOKEN=your_github_token_here \ + -e REPO_OWNER=your_owner \ + -e REPO_NAME=your_repo \ + -v $(pwd)/output:/app/output \ + codeberg.org/0xf1e/project-health-analyzer:latest + ``` + ## Output Files All output files are saved in JSONL format (JSON Lines), with one JSON object per line. By default, files are saved to the `output/` directory.