Merged

Changes from 2 commits
64 changes: 56 additions & 8 deletions llvm-ops-metrics/ops-container/process_llvm_commits.py
@@ -3,6 +3,7 @@
import logging
import os
import git
from google.cloud import bigquery
import requests

GRAFANA_URL = (
@@ -11,6 +12,10 @@
GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql"
REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"

# BigQuery dataset and tables to write metrics to.
OPERATIONAL_METRICS_DATASET = "operational_metrics"
LLVM_COMMITS_TABLE = "llvm_commits"

# How many commits to query the GitHub GraphQL API for at a time.
# Querying too many commits at once often leads to the call failing.
GITHUB_API_BATCH_SIZE = 50
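
A sketch of how commits might be chunked at this batch size before querying (the helper name is hypothetical, not part of this diff):

def batch_commits(commit_shas: list[str], size: int = GITHUB_API_BATCH_SIZE):
  # Yield fixed-size chunks so each GraphQL call stays below the size at
  # which the API tends to fail.
  for i in range(0, len(commit_shas), size):
    yield commit_shas[i : i + size]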
@@ -27,11 +32,23 @@
commit_{commit_sha}:
object(oid:"{commit_sha}") {{
... on Commit {{
author {{
user {{
login
}}
}}
associatedPullRequests(first: 1) {{
totalCount
pullRequest: nodes {{
number
reviewDecision
reviews(first: 10) {{
nodes {{
reviewer: author {{
login
}}
}}
}}
}}
}}
}}
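
For context, each commit gets an aliased subquery like the fragment above, so one HTTP request covers a whole batch. A sketch of how the batch might be assembled (the template constant and wrapper query are assumptions, not part of this diff):

def build_batch_query(commit_shas: list[str]) -> str:
  # Alias each subquery by SHA so the response maps back to individual commits.
  subqueries = "\n".join(
      COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=sha)
      for sha in commit_shas
  )
  return f'query {{ repository(owner: "llvm", name: "llvm-project") {{ {subqueries} }} }}'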
@@ -42,12 +59,14 @@
@dataclasses.dataclass
class LLVMCommitInfo:
commit_sha: str
commit_datetime: datetime.datetime
commit_timestamp_seconds: int
files_modified: list[str]
commit_author: str = "" # GitHub username of author is unknown until API call
has_pull_request: bool = False
pr_number: int = 0
pull_request_number: int = 0
is_reviewed: bool = False
is_approved: bool = False
reviewers: set[str] = dataclasses.field(default_factory=set)
Contributor:

Why the inconsistency here? You default-initialize this, but don't default-initialize commit_sha, commit_timestamp_seconds, or files_modified. You also use a set here when both `files_modified` and `reviewers` could be represented as sets.

Contributor Author:

commit_sha, commit_timestamp_seconds, and files_modified will always be available from the initial scrape of the repo, so they don't require a default value. A handful of fields cannot be set if a particular commit has no associated PR (the list of reviewers can't be determined without a PR, for example), so the default values ensure those fields are still populated when uploading to Grafana/BigQuery.

As far as typing goes, I agree about using sets for both files_modified and reviewers. I've updated the class to reflect that.
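
Based on that exchange, the updated class in a later commit presumably looks something like this sketch (field order and set types inferred from the conversation, not shown in this diff):

@dataclasses.dataclass
class LLVMCommitInfo:
  commit_sha: str
  commit_timestamp_seconds: int
  files_modified: set[str]
  commit_author: str = ""  # GitHub username of author is unknown until API call
  has_pull_request: bool = False
  pull_request_number: int = 0
  is_reviewed: bool = False
  is_approved: bool = False
  reviewers: set[str] = dataclasses.field(default_factory=set)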



def scrape_new_commits_by_date(
@@ -99,7 +118,9 @@ def query_for_reviews(
# Create a map of commit sha to info
new_commits = {
commit.hexsha: LLVMCommitInfo(
commit.hexsha, commit.committed_datetime, commit.committed_date
commit_sha=commit.hexsha,
commit_timestamp_seconds=commit.committed_date,
files_modified=list(commit.stats.files.keys()),
)
for commit in new_commits
}
@@ -142,27 +163,33 @@ def query_for_reviews(
)
if response.status_code < 200 or response.status_code >= 300:
logging.error("Failed to query GitHub GraphQL API: %s", response.text)
exit(1)
Contributor:

Can you add a comment on why we want to fail hard instead of gracefully continuing?

I'm presuming because if we miss an entire batch, that's a pretty large chunk of data, so failing gracefully doesn't make much sense?

Contributor Author:

That's correct. I've added a comment per your suggestion.
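
The added comment presumably reads along these lines (exact wording assumed, not shown in this diff):

# A failed batch drops an entire chunk of commit data, which would silently
# skew the daily metrics, so fail hard instead of continuing.
if response.status_code < 200 or response.status_code >= 300:
  logging.error("Failed to query GitHub GraphQL API: %s", response.text)
  exit(1)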

api_commit_data.update(response.json()["data"]["repository"])

# Amend commit information with GitHub data
for commit_sha, data in api_commit_data.items():
# Verify that push commit has no pull requests
commit_sha = commit_sha.removeprefix("commit_")
commit_info = new_commits[commit_sha]
commit_info.commit_author = data["author"]["user"]["login"]

# If commit has no pull requests, skip it. No data to update.
if data["associatedPullRequests"]["totalCount"] == 0:
continue

pull_request = data["associatedPullRequests"]["pullRequest"][0]
commit_info = new_commits[commit_sha]
commit_info.has_pull_request = True
commit_info.pr_number = pull_request["number"]
commit_info.pull_request_number = pull_request["number"]
commit_info.is_reviewed = pull_request["reviewDecision"] is not None
commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
commit_info.reviewers = set([
review["reviewer"]["login"]
for review in pull_request["reviews"]["nodes"]
])

return list(new_commits.values())


def upload_daily_metrics(
def upload_daily_metrics_to_grafana(
grafana_api_key: str,
grafana_metrics_userid: str,
new_commits: list[LLVMCommitInfo],
@@ -205,6 +232,22 @@ def upload_daily_metrics(
logging.error("Failed to submit data to Grafana: %s", response.text)


def upload_daily_metrics_to_bigquery(new_commits: list[LLVMCommitInfo]) -> None:
"""Upload processed commit metrics to a BigQuery dataset.

Args:
new_commits: List of commits to process & upload to BigQuery.
"""
bq_client = bigquery.Client()
table_ref = bq_client.dataset(OPERATIONAL_METRICS_DATASET).table(
LLVM_COMMITS_TABLE
)
table = bq_client.get_table(table_ref)
commit_records = [dataclasses.asdict(commit) for commit in new_commits]
bq_client.insert_rows(table, commit_records)
bq_client.close()
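
One caveat worth noting: dataclasses.asdict leaves set-typed fields (reviewers, and files_modified once it becomes a set) as Python sets. insert_rows generally accepts any iterable for REPEATED columns, but converting explicitly keeps the payload deterministic and unambiguously JSON-safe. A sketch of such a helper (hypothetical, not part of this diff):

def to_bigquery_record(commit: LLVMCommitInfo) -> dict:
  # Convert set fields to sorted lists for the REPEATED string columns.
  record = dataclasses.asdict(commit)
  return {
      key: sorted(value) if isinstance(value, set) else value
      for key, value in record.items()
  }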


def main() -> None:
github_token = os.environ["GITHUB_TOKEN"]
grafana_api_key = os.environ["GRAFANA_API_KEY"]
@@ -227,7 +270,12 @@ def main() -> None:
new_commit_info = query_for_reviews(new_commits, github_token)

logging.info("Uploading metrics to Grafana.")
upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
upload_daily_metrics_to_grafana(
grafana_api_key, grafana_metrics_userid, new_commit_info
)

logging.info("Uploading metrics to BigQuery.")
upload_daily_metrics_to_bigquery(new_commit_info)


if __name__ == "__main__":
56 changes: 56 additions & 0 deletions premerge/bigquery_schema/llvm_commits_table_schema.json
@@ -0,0 +1,56 @@
[
{
"name": "commit_sha",
"type": "STRING",
"mode": "NULLABLE",
"description": "Commit hexsha of a commit made to llvm/llvm-project:main"
},
{
"name": "commit_author",
"type": "STRING",
"mode": "NULLABLE",
"description": "GitHub username of the commit author"
},
{
"name": "commit_timestamp_seconds",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "Time this commit was made at, as a Unix timestamp"
},
{
"name": "has_pull_request",
"type": "BOOLEAN",
"mode": "NULLABLE",
"description": "Whether or not this commit has an associated pull request"
},
{
"name": "pull_request_number",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "Number of the pull request associated with this commit"
},
{
"name": "is_reviewed",
"type": "BOOLEAN",
"mode": "NULLABLE",
"description": "Whether or not the pull request for this commit was reviewed"
},
{
"name": "is_approved",
"type": "BOOLEAN",
"mode": "NULLABLE",
"description": "Whether or not the pull request for this commit was approved"
},
{
"name": "reviewers",
"type": "STRING",
"mode": "REPEATED",
"description": "List of GitHub users who reviewed the pull request for this commit"
},
{
"name": "files_modified",
"type": "STRING",
"mode": "REPEATED",
"description": "List of filepaths modified by this commit"
}
]
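
For local testing outside Terraform, the same schema file can be loaded with the BigQuery Python client. A sketch, assuming application-default credentials and a placeholder project ID:

from google.cloud import bigquery

client = bigquery.Client()
schema = client.schema_from_json(
    "premerge/bigquery_schema/llvm_commits_table_schema.json"
)
table = bigquery.Table("my-project.operational_metrics.llvm_commits", schema=schema)
client.create_table(table, exists_ok=True)  # No-op if the table already exists.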
27 changes: 27 additions & 0 deletions premerge/main.tf
@@ -293,3 +293,30 @@ resource "kubernetes_manifest" "operational_metrics_cronjob" {
kubernetes_service_account.operational_metrics_ksa,
]
}

# BigQuery dataset and table resources
resource "google_bigquery_dataset" "operational_metrics_dataset" {
dataset_id = "operational_metrics"
description = "Dataset for retaining operational data regarding LLVM commit trends."
}

resource "google_bigquery_table" "llvm_commits_table" {
dataset_id = google_bigquery_dataset.operational_metrics_dataset.dataset_id
table_id = "llvm_commits"
description = "LLVM commit data, including pull request and review activity per commit."

schema = file("./bigquery_schema/llvm_commits_table_schema.json")

depends_on = [google_bigquery_dataset.operational_metrics_dataset]
}

resource "google_bigquery_dataset_iam_binding" "operational_metrics_dataset_editor_binding" {
dataset_id = google_bigquery_dataset.operational_metrics_dataset.dataset_id
role = "roles/bigquery.dataEditor"

members = [
"serviceAccount:${google_service_account.operational_metrics_gsa.email}",
]

depends_on = [google_bigquery_dataset.operational_metrics_dataset, google_service_account.operational_metrics_gsa]
}
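
Once the cronjob has run, a quick spot-check that rows are landing in the table might look like this (a hypothetical verification step, not part of this change):

from google.cloud import bigquery

client = bigquery.Client()
query = """
  SELECT commit_sha, pull_request_number, is_approved
  FROM `operational_metrics.llvm_commits`
  ORDER BY commit_timestamp_seconds DESC
  LIMIT 5
"""
for row in client.query(query).result():
  print(row.commit_sha, row.pull_request_number, row.is_approved)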