-
Couldn't load subscription status.
- Fork 119
[CI] Export scraped commit data to a BigQuery dataset #532
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
d9ddedb
42a8c38
cf1c414
cb81db0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| import logging | ||
| import os | ||
| import git | ||
| from google.cloud import bigquery | ||
| import requests | ||
|
|
||
| GRAFANA_URL = ( | ||
|
|
@@ -11,6 +12,10 @@ | |
| GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql" | ||
| REPOSITORY_URL = "https://github.com/llvm/llvm-project.git" | ||
|
|
||
| # BigQuery dataset and tables to write metrics to. | ||
| OPERATIONAL_METRICS_DATASET = "operational_metrics" | ||
| LLVM_COMMITS_TABLE = "llvm_commits" | ||
|
|
||
| # How many commits to query the GitHub GraphQL API for at a time. | ||
| # Querying too many commits at once often leads to the call failing. | ||
| GITHUB_API_BATCH_SIZE = 50 | ||
|
|
@@ -27,11 +32,23 @@ | |
| commit_{commit_sha}: | ||
| object(oid:"{commit_sha}") {{ | ||
| ... on Commit {{ | ||
| author {{ | ||
| user {{ | ||
| login | ||
| }} | ||
| }} | ||
| associatedPullRequests(first: 1) {{ | ||
| totalCount | ||
| pullRequest: nodes {{ | ||
| number | ||
| reviewDecision | ||
| reviews(first: 10) {{ | ||
| nodes {{ | ||
| reviewer: author {{ | ||
| login | ||
| }} | ||
| }} | ||
| }} | ||
| }} | ||
| }} | ||
| }} | ||
|
|
@@ -42,12 +59,14 @@ | |
| @dataclasses.dataclass | ||
| class LLVMCommitInfo: | ||
| commit_sha: str | ||
| commit_datetime: datetime.datetime | ||
| commit_timestamp_seconds: int | ||
| files_modified: list[str] | ||
| commit_author: str = "" # GitHub username of author is unknown until API call | ||
| has_pull_request: bool = False | ||
| pr_number: int = 0 | ||
| pull_request_number: int = 0 | ||
| is_reviewed: bool = False | ||
| is_approved: bool = False | ||
| reviewers: set[str] = dataclasses.field(default_factory=set) | ||
|
|
||
|
|
||
| def scrape_new_commits_by_date( | ||
|
|
@@ -99,7 +118,9 @@ def query_for_reviews( | |
| # Create a map of commit sha to info | ||
| new_commits = { | ||
| commit.hexsha: LLVMCommitInfo( | ||
| commit.hexsha, commit.committed_datetime, commit.committed_date | ||
| commit_sha=commit.hexsha, | ||
| commit_timestamp_seconds=commit.committed_date, | ||
| files_modified=list(commit.stats.files.keys()), | ||
| ) | ||
| for commit in new_commits | ||
| } | ||
|
|
@@ -142,27 +163,33 @@ def query_for_reviews( | |
| ) | ||
| if response.status_code < 200 or response.status_code >= 300: | ||
| logging.error("Failed to query GitHub GraphQL API: %s", response.text) | ||
| exit(1) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a comment on why we want to fail hard instead of gracefully continuing? I'm presuming because if we miss an entire batch that's a pretty large chunk of data so failing gracefully doesn't make much sense? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's correct, I've added a comment per your suggestion |
||
| api_commit_data.update(response.json()["data"]["repository"]) | ||
|
|
||
| # Amend commit information with GitHub data | ||
| for commit_sha, data in api_commit_data.items(): | ||
| # Verify that push commit has no pull requests | ||
| commit_sha = commit_sha.removeprefix("commit_") | ||
| commit_info = new_commits[commit_sha] | ||
| commit_info.commit_author = data["author"]["user"]["login"] | ||
|
|
||
| # If commit has no pull requests, skip it. No data to update. | ||
| if data["associatedPullRequests"]["totalCount"] == 0: | ||
| continue | ||
|
|
||
| pull_request = data["associatedPullRequests"]["pullRequest"][0] | ||
| commit_info = new_commits[commit_sha] | ||
| commit_info.has_pull_request = True | ||
| commit_info.pr_number = pull_request["number"] | ||
| commit_info.pull_request_number = pull_request["number"] | ||
| commit_info.is_reviewed = pull_request["reviewDecision"] is not None | ||
| commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED" | ||
| commit_info.reviewers = set([ | ||
| review["reviewer"]["login"] | ||
| for review in pull_request["reviews"]["nodes"] | ||
| ]) | ||
|
|
||
| return list(new_commits.values()) | ||
|
|
||
|
|
||
| def upload_daily_metrics( | ||
| def upload_daily_metrics_to_grafana( | ||
| grafana_api_key: str, | ||
| grafana_metrics_userid: str, | ||
| new_commits: list[LLVMCommitInfo], | ||
|
|
@@ -205,6 +232,22 @@ def upload_daily_metrics( | |
| logging.error("Failed to submit data to Grafana: %s", response.text) | ||
|
|
||
|
|
||
| def upload_daily_metrics_to_bigquery(new_commits: list[LLVMCommitInfo]) -> None: | ||
| """Upload processed commit metrics to a BigQuery dataset. | ||
|
|
||
| Args: | ||
| new_commits: List of commits to process & upload to BigQuery. | ||
| """ | ||
| bq_client = bigquery.Client() | ||
| table_ref = bq_client.dataset(OPERATIONAL_METRICS_DATASET).table( | ||
| LLVM_COMMITS_TABLE | ||
| ) | ||
| table = bq_client.get_table(table_ref) | ||
| commit_records = [dataclasses.asdict(commit) for commit in new_commits] | ||
| bq_client.insert_rows(table, commit_records) | ||
| bq_client.close() | ||
|
|
||
|
|
||
| def main() -> None: | ||
| github_token = os.environ["GITHUB_TOKEN"] | ||
| grafana_api_key = os.environ["GRAFANA_API_KEY"] | ||
|
|
@@ -227,7 +270,12 @@ def main() -> None: | |
| new_commit_info = query_for_reviews(new_commits, github_token) | ||
|
|
||
| logging.info("Uploading metrics to Grafana.") | ||
| upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info) | ||
| upload_daily_metrics_to_grafana( | ||
| grafana_api_key, grafana_metrics_userid, new_commit_info | ||
| ) | ||
|
|
||
| logging.info("Uploading metrics to BigQuery.") | ||
| upload_daily_metrics_to_bigquery(new_commit_info) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| [ | ||
| { | ||
| "name": "commit_sha", | ||
| "type": "STRING", | ||
| "mode": "NULLABLE", | ||
| "description": "Commit hexsha of a commit made to llvm/llvm-project:main" | ||
| }, | ||
| { | ||
| "name": "commit_author", | ||
| "type": "STRING", | ||
| "mode": "NULLABLE", | ||
| "description": "GitHub username of the commit author" | ||
| }, | ||
| { | ||
| "name": "commit_timestamp_seconds", | ||
| "type": "INTEGER", | ||
| "mode": "NULLABLE", | ||
| "description": "Time this commit was made at, as a Unix timestamp" | ||
| }, | ||
| { | ||
| "name": "has_pull_request", | ||
| "type": "BOOLEAN", | ||
| "mode": "NULLABLE", | ||
| "description": "Whether or not this commit has an associated pull request" | ||
| }, | ||
| { | ||
| "name": "pull_request_number", | ||
| "type": "INTEGER", | ||
| "mode": "NULLABLE", | ||
| "description": "Number of the pull request associated with this commit" | ||
| }, | ||
| { | ||
| "name": "is_reviewed", | ||
| "type": "BOOLEAN", | ||
| "mode": "NULLABLE", | ||
| "description": "Whether or not the pull request for this commit was reviewed" | ||
| }, | ||
| { | ||
| "name": "is_approved", | ||
| "type": "BOOLEAN", | ||
| "mode": "NULLABLE", | ||
| "description": "Whether or not the pull request for this commit was approved" | ||
| }, | ||
| { | ||
| "name": "reviewers", | ||
| "type": "STRING", | ||
| "mode": "REPEATED", | ||
| "description": "List of GitHub users who reviewed the pull request for this commit" | ||
| }, | ||
| { | ||
| "name": "files_modified", | ||
| "type": "STRING", | ||
| "mode": "REPEATED", | ||
| "description": "List of filepaths modified by this commit" | ||
| } | ||
| ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why the inconsistency here? You default initialize this, but don't default initialize
commit_sha, commit_timestamp_seconds, orfiles_modified. You also use a set here when bothfiles_modifiedandreviewers` could be represented as sets.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
commit_sha,commit_timestamp_secondsandfiles_modifiedwill always be available from the initial scrape of the repo, so they don't require a default value. There are a handful of fields that will cannot be set if a particular commit does not have an associated PR (can't determine list of reviewers without a PR, for example), so the default values are there to ensure that those fields are populated when uploading to grafana/bigqueryAs far as typing goes, I agree about using sets for both
files_modifiedandreviewers. I've updated the class to reflect that