Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 99 additions & 10 deletions augur/tasks/github/repo_info/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,90 @@
data=0

return data

def get_community_health_files(key_auth, logger, owner, repo):
    """Fetch community health file URLs from the repo's community profile.

    Queries the GitHub ``/community/profile`` endpoint for ``owner/repo`` and
    returns a dict with the keys ``contributing_file``, ``security_issue_file``
    and ``changelog_file``. Only ``contributing_file`` is ever populated here
    (from the profile's ``files.contributing.html_url``); the other two keys
    are always ``None``. Any error is logged as a warning and an all-``None``
    dict is returned.
    """
    logger.info('Querying community health files\n')
    profile_url = f'https://api.github.com/repos/{owner}/{repo}/community/profile'

    def _empty_result():
        # Fresh dict on every call so callers never share state.
        return dict.fromkeys(
            ('contributing_file', 'security_issue_file', 'changelog_file')
        )

    try:
        profile = GithubDataAccess(key_auth, logger).get_resource(profile_url)
        found = _empty_result()

        # Nothing usable came back: report every file as missing.
        if not profile or 'files' not in profile:
            return found

        profile_files = profile['files']
        contributing_entry = (
            profile_files['contributing'] if 'contributing' in profile_files else None
        )
        if contributing_entry:
            found['contributing_file'] = contributing_entry.get('html_url')

        return found

    except Exception as err:
        logger.warning(f"Could not fetch community health files: {err}")
        return _empty_result()

def _find_first_repo_file(github_data_access, logger, owner, repo, paths, label):
    """Return the ``html_url`` of the first of *paths* found in the repo, else None.

    Each candidate path is probed through the GitHub contents endpoint. A
    failed probe is treated as "not at this path": it is logged at debug
    level and the next candidate is tried.
    """
    for path in paths:
        url = f'https://api.github.com/repos/{owner}/{repo}/contents/{path}'
        try:
            response = github_data_access.get_resource(url)
        except Exception as e:
            # Best-effort lookup: keep probing, but unlike a bare ``except:``
            # this lets KeyboardInterrupt/SystemExit propagate.
            logger.debug(f"No {label} file at {path}: {e}")
            continue

        # Only a dict payload can carry an ``html_url`` for a single file.
        if isinstance(response, dict) and 'html_url' in response:
            logger.info(f"Found {label} file at {path}")
            return response['html_url']

    return None


def check_for_security_and_changelog(key_auth, logger, owner, repo):
    """Look for SECURITY and CHANGELOG files in common repository locations.

    Args:
        key_auth: credentials object passed through to ``GithubDataAccess``.
        logger: logger used for progress and diagnostic messages.
        owner: repository owner (user or organisation) name.
        repo: repository name.

    Returns:
        dict with the keys ``security_issue_file`` and ``changelog_file``;
        each value is the ``html_url`` of the first matching file, or
        ``None`` when no candidate path could be fetched.
    """
    logger.info('Checking for SECURITY and CHANGELOG files\n')

    security_paths = ['SECURITY.md', '.github/SECURITY.md', 'docs/SECURITY.md']
    changelog_paths = ['CHANGELOG.md', 'CHANGELOG', 'CHANGELOG.txt', 'HISTORY.md', 'RELEASES.md']

    github_data_access = GithubDataAccess(key_auth, logger)

    return {
        'security_issue_file': _find_first_repo_file(
            github_data_access, logger, owner, repo, security_paths, 'SECURITY'
        ),
        'changelog_file': _find_first_repo_file(
            github_data_access, logger, owner, repo, changelog_paths, 'CHANGELOG'
        ),
    }

def get_repo_topics(repo_data):
    """Join the repository's topics into a comma-separated string.

    Args:
        repo_data: repository payload; expected to be a dict that may hold a
            ``topics`` list of strings. Any other value is tolerated.

    Returns:
        The topics joined with ``,``, or ``None`` when ``repo_data`` is not a
        dict, has no ``topics`` key, or its topics are empty, not a list, or
        contain non-string items.
    """
    # A non-dict value means the lookup yielded nothing usable.
    if not isinstance(repo_data, dict):
        return None

    topics = repo_data.get('topics')
    if not topics or not isinstance(topics, list):
        return None

    try:
        return ','.join(topics)
    except TypeError:
        # A non-string topic makes the list unusable as keywords.
        return None
"""
def get_repo_data(logger, url, response):
data = {}
Expand Down Expand Up @@ -187,34 +271,40 @@

data = github_graphql_data_access.get_resource(query, variables, result_keys)

# Get committers count info that requires separate endpoint
committers_count = query_committers_count(key_auth, logger, owner, repo)

repo_data = get_repo_data(logger, owner, repo)

health_files = get_community_health_files(key_auth, logger, owner, repo)

security_changelog = check_for_security_and_changelog(key_auth, logger, owner, repo)

topics = get_repo_topics(repo_data)

# Put all data together in format of the table
logger.info(f'Inserting repo info for repo with id:{repo_orm_obj.repo_id}, owner:{owner}, name:{repo}\n')
rep_inf = {
'repo_id': repo_orm_obj.repo_id,
'last_updated': data['updatedAt'] if 'updatedAt' in data else None,
'issues_enabled': data['hasIssuesEnabled'] if 'hasIssuesEnabled' in data else None,
'open_issues': data['issues']['totalCount'] if data['issues'] else None,
'pull_requests_enabled': None,
'pull_requests_enabled': 'true' if repo_data.get('has_projects') else 'false',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are not using the .get syntax elsewhere. I am not sure I love the nesting, and I agree it is useful to get these data elements we previously ignored.

Can you share a rationale for this design choice?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The `.get` syntax is better because it won't crash if the key doesn't exist (it returns a default value, which is `None` by default).

I use it all the time

'wiki_enabled': data['hasWikiEnabled'] if 'hasWikiEnabled' in data else None,
'pages_enabled': None,
'pages_enabled': 'true' if repo_data.get('has_pages') else 'false',
'fork_count': data['forkCount'] if 'forkCount' in data else None,
'default_branch': data['defaultBranchRef']['name'] if data['defaultBranchRef'] else None,
'watchers_count': data['watchers']['totalCount'] if data['watchers'] else None,
'license': data['licenseInfo']['name'] if data['licenseInfo'] else None,
'stars_count': data['stargazers']['totalCount'] if data['stargazers'] else None,
'committers_count': committers_count,
'issue_contributors_count': None,
'changelog_file': None,
'contributing_file': None,
'changelog_file': security_changelog.get('changelog_file'),
'contributing_file': health_files.get('contributing_file'),
'license_file': data['licenseInfo']['url'] if data['licenseInfo'] else None,
'code_of_conduct_file': data['codeOfConduct']['url'] if data['codeOfConduct'] else None,
'security_issue_file': None,
'security_issue_file': security_changelog.get('security_issue_file'),
'security_audit_file': None,
'status': None,
'keywords': None,
'keywords': topics,
'commit_count': data['defaultBranchRef']['target']['history']['totalCount'] if data['defaultBranchRef'] else None,
'issues_count': data['issue_count']['totalCount'] if data['issue_count'] else None,
'issues_closed': data['issues_closed']['totalCount'] if data['issues_closed'] else None,
Expand Down Expand Up @@ -244,8 +334,7 @@

execute_sql(insert_statement)

# Note that the addition of information about where a repository may be forked from, and whether a repository is archived, updates the `repo` table, not the `repo_info` table.
repo_data = get_repo_data(logger, owner, repo)


forked = is_forked(logger, repo_data)
archived = is_archived(logger, repo_data)
Expand Down Expand Up @@ -284,7 +373,7 @@

try:
response_data = response.json()
except:

Check warning on line 376 in augur/tasks/github/repo_info/core.py

View workflow job for this annotation

GitHub Actions / runner / pylint

[pylint] reported by reviewdog 🐶 W0702: No exception type(s) specified (bare-except) Raw Output: augur/tasks/github/repo_info/core.py:376:4: W0702: No exception type(s) specified (bare-except)
response_data = json.loads(json.dumps(response.text))

#Insert any data that was returned
Expand Down
Loading