Skip to content

Commit 37086ab

Browse files
JetXu-LLM and Jet Xu authored
enhance PR content extraction and issue linking (#26)
* arrange logger.py * Add comments to llm_integration/initial_load.py * refactor: Remove torch dependency for device detection - Replace PyTorch-based device detection with a lightweight custom implementation - Add support for CUDA, MPS (Apple Silicon), and CPU detection using standard libraries - Improve portability and reduce overall package size * feat(pr-analysis): enhance PR content extraction and issue linking - Improve issue number extraction with comprehensive regex patterns - Add support for GitHub's official autolink reference formats - Implement commit details extraction in PR content - Add validation for issue numbers to prevent invalid references - Support multiple issue reference formats (URL, #, keywords) This commit enhances the PR analysis capabilities by implementing a more robust issue linking system and detailed commit information extraction, following GitHub's official documentation and best practices. * remove torch * Update extract_related_issues function --------- Co-authored-by: Jet Xu <[email protected]>
1 parent 51eb005 commit 37086ab

File tree

4 files changed

+104
-15
lines changed

4 files changed

+104
-15
lines changed

CHANGELOG.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.1.6] - 2024-10-30
9+
10+
### New Features
11+
- Enhanced PR content analysis with detailed commit information extraction
12+
- Improved issue linking detection with support for multiple reference formats
13+
- Full GitHub URLs, #references, and keyword-based references
14+
- Added validation for issue numbers
15+
16+
### Improvements
17+
- Added detailed commit metadata extraction including stats and file changes
18+
- Enhanced error handling for commit fetching
19+
820
## [0.1.5] - 2024-10-14
921

1022
### Optimized

llama_github/data_retrieval/github_entities.py

Lines changed: 90 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -241,25 +241,79 @@ def get_issue_content(self, number, issue=None) -> str:
241241

242242
def extract_related_issues(self, pr_data: Dict[str, Any]) -> List[int]:
243243
"""
244-
Extracts related issue numbers from the PR description and other fields.
245-
246-
:param pr_data: The pull request data dictionary.
247-
:return: A list of related issue numbers.
244+
Extracts related issue numbers from all PR data following GitHub's reference syntax.
245+
246+
This function implements GitHub's official autolink reference formats to find:
247+
1. Full GitHub issue/PR URLs
248+
2. Numeric references (#123)
249+
3. Keywords + issue references (fixes #123)
250+
4. Repository cross-references (owner/repo#123)
251+
252+
See: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/autolinked-references-and-urls
253+
254+
Args:
255+
pr_data: Dict[str, Any] - The complete pull request data dictionary
256+
257+
Returns:
258+
List[int] - A sorted list of unique issue numbers found in the PR data
248259
"""
260+
# GitHub's official closing keywords
261+
closing_keywords = (
262+
'close', 'closes', 'closed',
263+
'fix', 'fixes', 'fixed',
264+
'resolve', 'resolves', 'resolved'
265+
)
266+
267+
# Regex patterns for GitHub issue references
249268
patterns = [
250-
rf'https://github\.com/{re.escape(self.full_name)}/issues/(\d+)',
251-
r'(?:^|\s)#(\d+)',
252-
r'(?:^|\s)(\d+)(?:\s|$)',
269+
# Full GitHub issue/PR URL pattern
270+
rf'(?:https?://)?github\.com/{re.escape(self.full_name)}/(?:issues|pull)/(\d+)',
271+
272+
# Standard #123 reference with proper boundaries
273+
r'(?:^|[^\w/])#(\d+)(?=[^\w/]|$)',
274+
275+
# Closing keywords (fixes #123)
276+
fr'(?:^|[^\w/])(?:{"|".join(closing_keywords)}):?\s+#(\d+)(?=[^\w/]|$)',
277+
278+
# Cross-repo reference (owner/repo#123)
279+
rf'{re.escape(self.full_name)}#(\d+)',
280+
281+
# Issue keyword reference (issue #123 or issue: #123)
282+
r'(?:^|[^\w/])(?:issue|bug|ticket|todo|task)s?:?\s+#?(\d+)(?=[^\w/]|$)'
253283
]
284+
254285
issues = set()
255-
# Convert PR data to JSON string for pattern matching
256-
pr_description = json.dumps(pr_data, default=str)
257286

258-
for pattern in patterns:
259-
matches = re.findall(pattern, pr_description)
260-
issues.update(int(match) for match in matches)
287+
def extract_from_text(text: str) -> None:
288+
"""Helper function to extract issue numbers from text"""
289+
if not isinstance(text, str):
290+
return
291+
292+
for pattern in patterns:
293+
matches = re.findall(pattern, text, re.IGNORECASE | re.MULTILINE)
294+
# Validate issue numbers (reasonable length and positive values)
295+
valid_matches = [
296+
int(match) for match in matches
297+
if match.isdigit() and len(match) <= 7 and int(match) > 0
298+
]
299+
issues.update(valid_matches)
300+
301+
def process_value(value: Any) -> None:
302+
"""Recursively process dictionary values and extract issue numbers"""
303+
if isinstance(value, dict):
304+
for v in value.values():
305+
process_value(v)
306+
elif isinstance(value, (list, tuple)):
307+
for item in value:
308+
process_value(item)
309+
elif isinstance(value, str):
310+
extract_from_text(value)
311+
312+
# Process all data in pr_data recursively
313+
process_value(pr_data)
261314

262-
return list(issues)
315+
return sorted(list(issues))
316+
263317

264318
def get_issue_contents(self, issue_numbers: List[int], pr_number: int) -> List[Dict[str, Any]]:
265319
"""
@@ -324,11 +378,34 @@ def get_pr_content(self, number, pr=None, context_lines=10) -> Dict[str, Any]:
324378
"head_branch": pr.head.ref,
325379
},
326380
"related_issues": [],
381+
"commits": [],
327382
"file_changes": [],
328383
"ci_cd_results": [],
329384
"interactions": []
330385
}
331386

387+
# Fetch and process commits
388+
try:
389+
commits = pr.get_commits()
390+
for commit in commits:
391+
commit_data = {
392+
"sha": commit.sha,
393+
"message": commit.commit.message,
394+
"author": commit.commit.author.name,
395+
"date": self.to_isoformat(commit.commit.author.date),
396+
"stats": {
397+
"additions": commit.stats.additions,
398+
"deletions": commit.stats.deletions,
399+
"total": commit.stats.total
400+
},
401+
"files": [f.filename for f in commit.files] # Just keep changed file names
402+
}
403+
pr_data["commits"].append(commit_data)
404+
except GithubException as e:
405+
logger.exception(f"Error fetching commits for PR #{number}")
406+
pr_data["commits"] = []
407+
pr_data["commit_stats"] = {}
408+
332409
# Fetch CI/CD results
333410
try:
334411
last_commit = pr.get_commits().reversed[0]

llama_github/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.1.5'
1+
__version__ = '0.1.6'

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = llama-github
3-
version = 0.1.5
3+
version = 0.1.6
44
author = Jet Xu
55
author_email = [email protected]
66
description = Llama-github is an open-source Python library that empowers LLM Chatbots, AI Agents, and Auto-dev Agents to conduct Retrieval from actively selected GitHub public projects. It Augments through LLMs and Generates context for any coding question, in order to streamline the development of sophisticated AI-driven applications.

0 commit comments

Comments
 (0)