Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve GitHub resolver robustness and efficiency #5462

Closed
wants to merge 10 commits into from
221 changes: 136 additions & 85 deletions openhands/resolver/issue_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,13 @@ def _download_issues_from_github(self) -> list[Any]:
if not issues:
break

# Sanity check - the response is a list of dictionaries
if not isinstance(issues, list) or any(
# Handle both list and single-object responses
if isinstance(issues, dict):
issues = [issues]
elif not isinstance(issues, list) or any(
[not isinstance(issue, dict) for issue in issues]
):
raise ValueError('Expected list of dictionaries from Github API.')
raise ValueError('Expected list or dictionary from Github API.')

# Add the issues to the final list
all_issues.extend(issues)
Expand All @@ -91,6 +93,9 @@ def _extract_image_urls(self, issue_body: str) -> list[str]:
return re.findall(image_pattern, issue_body)

def _extract_issue_references(self, body: str) -> list[int]:
if not body:
return []

# First, remove code blocks as they may contain false positives
body = re.sub(r'```.*?```', '', body, flags=re.DOTALL)

Expand All @@ -100,12 +105,13 @@ def _extract_issue_references(self, body: str) -> list[int]:
# Remove URLs that contain hash symbols
body = re.sub(r'https?://[^\s)]*#\d+[^\s)]*', '', body)

# Now extract issue numbers, making sure they're not part of other text
# Extract issue numbers that are explicitly referenced
# The pattern matches #number that:
# 1. Is at the start of text or after whitespace/punctuation
# 2. Is followed by whitespace, punctuation, or end of text
# 3. Is not part of a URL
pattern = r'(?:^|[\s\[({]|[^\w#])#(\d+)(?=[\s,.\])}]|$)'
# 1. Is preceded by specific keywords like "fix", "close", "resolve", "see issue", etc.
# 2. Is at the start of text or after whitespace/punctuation
# 3. Is followed by whitespace, punctuation, or end of text
keywords = r'(?:fix(?:e[ds])?|close[ds]?|resolve[ds]?|see issue|related to|references?|addresses?)'
pattern = fr'(?i)(?:{keywords}\s+)?(?:^|[\s\[\(\{{]|[^\w#])#(\d+)(?=[\s,.\]\)\}}]|$)'
return [int(match) for match in re.findall(pattern, body)]

def _get_issue_comments(
Expand All @@ -126,33 +132,37 @@ def _get_issue_comments(
all_comments = []

# Get comments, page by page
while True:
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
comments = response.json()

if not comments:
break

# If a single comment ID is provided, return only that comment
if comment_id:
matching_comment = next(
(
comment['body']
for comment in comments
if comment['id'] == comment_id
),
None,
)
if matching_comment:
return [matching_comment]
else:
# Otherwise, return all comments
all_comments.extend([comment['body'] for comment in comments])
try:
while True:
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
comments = response.json()

if not comments:
break

# If a single comment ID is provided, return only that comment
if comment_id:
matching_comment = next(
(
comment['body']
for comment in comments
if comment['id'] == comment_id
),
None,
)
if matching_comment:
return [matching_comment]
else:
# Otherwise, return all comments
all_comments.extend([comment['body'] for comment in comments])

params['page'] += 1
params['page'] += 1

return all_comments if all_comments else None
return all_comments if all_comments else None
except (requests.exceptions.RequestException, StopIteration):
# Return None if we can't get any comments
return None

def get_converted_issues(
self, issue_numbers: list[int] | None = None, comment_id: int | None = None
Expand All @@ -166,17 +176,12 @@ def get_converted_issues(
Returns:
List of Github issues.
"""

if not issue_numbers:
raise ValueError('Unspecified issue number')

all_issues = self._download_issues_from_github()
logger.info(f'Limiting resolving to issues {issue_numbers}.')
all_issues = [
issue
for issue in all_issues
if issue['number'] in issue_numbers and 'pull_request' not in issue
]
all_issues = [issue for issue in all_issues if issue['number'] in issue_numbers]

if len(issue_numbers) == 1 and not all_issues:
raise ValueError(f'Issue {issue_numbers[0]} not found')
Expand Down Expand Up @@ -530,6 +535,23 @@ def __get_context_from_external_issues_references(
response = requests.get(url, headers=headers)
response.raise_for_status()
issue_data = response.json()

# Handle both list and single-object responses
if isinstance(issue_data, list):
# Find the matching issue in the list
matching_issues = [
i for i in issue_data if i.get('number') == issue_number
]
if not matching_issues:
logger.warning(f'Issue {issue_number} not found in response')
continue
issue_data = matching_issues[0]
elif not isinstance(issue_data, dict):
logger.warning(
f'Unexpected response type for issue {issue_number}: {type(issue_data)}'
)
continue

issue_body = issue_data.get('body', '')
if issue_body:
closing_issues.append(issue_body)
Expand All @@ -544,57 +566,86 @@ def get_converted_issues(
if not issue_numbers:
raise ValueError('Unspecified issue numbers')

all_issues = self._download_issues_from_github()
enyst marked this conversation as resolved.
Show resolved Hide resolved
logger.info(f'Limiting resolving to issues {issue_numbers}.')
all_issues = [issue for issue in all_issues if issue['number'] in issue_numbers]

logger.info(f'Fetching issues {issue_numbers}.')
converted_issues = []
for issue in all_issues:
# For PRs, body can be None
if any([issue.get(key) is None for key in ['number', 'title']]):
logger.warning(f'Skipping #{issue} as it is missing number or title.')
continue
headers = {
'Authorization': f'token {self.token}',
'Accept': 'application/vnd.github.v3+json',
}

# Handle None body for PRs
body = issue.get('body') if issue.get('body') is not None else ''
(
closing_issues,
closing_issues_numbers,
review_comments,
review_threads,
thread_ids,
) = self.__download_pr_metadata(issue['number'], comment_id=comment_id)
head_branch = issue['head']['ref']

# Get PR thread comments
thread_comments = self._get_pr_comments(
issue['number'], comment_id=comment_id
)
for issue_number in issue_numbers:
try:
url = f'https://api.github.com/repos/{self.owner}/{self.repo}/issues/{issue_number}'
response = requests.get(url, headers=headers)
response.raise_for_status()
issue = response.json()

# Handle both list and single-object responses
if isinstance(issue, list):
Copy link
Contributor

@malhotra5 malhotra5 Dec 9, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section of code that checks for list and dict instance seems to be duplicated from IssueHandler

Ideally we share this as a function across the handlers

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@openhands-agent please take a look at this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Openhands fix success summary

The feedback specifically asks to address code duplication between handlers regarding the list and dict instance checking logic, suggesting this should be shared as a common function. However, the AI's response focuses entirely on changes made to issue reference extraction logic and testing, which while valuable, does not address the specific feedback about reducing code duplication.

The AI should have:

  1. Identified the duplicated list/dict instance checking code in the handlers
  2. Created a shared utility function that both handlers could use
  3. Refactored the handlers to use this shared function
  4. Verified the changes maintain the same functionality

Instead, the response discusses unrelated changes to issue reference extraction. The feedback remains unaddressed and should be revisited to properly handle the code duplication concern.

# Find the matching issue in the list
matching_issues = [
i for i in issue if i.get('number') == issue_number
]
if not matching_issues:
logger.warning(f'Issue {issue_number} not found in response')
continue
issue = matching_issues[0]
elif not isinstance(issue, dict):
logger.warning(
f'Unexpected response type for issue {issue_number}: {type(issue)}'
)
continue

closing_issues = self.__get_context_from_external_issues_references(
closing_issues,
closing_issues_numbers,
body,
review_comments,
review_threads,
thread_comments,
)
# For PRs, body can be None
if any([issue.get(key) is None for key in ['number', 'title']]):
logger.warning(
f'Skipping #{issue} as it is missing number or title.'
)
continue

# Handle None body for PRs
body = issue.get('body') if issue.get('body') is not None else ''
(
closing_issues,
closing_issues_numbers,
review_comments,
review_threads,
thread_ids,
) = self.__download_pr_metadata(issue['number'], comment_id=comment_id)
head_branch = issue['head']['ref']

# Get PR thread comments
thread_comments = self._get_pr_comments(
issue['number'], comment_id=comment_id
)

issue_details = GithubIssue(
owner=self.owner,
repo=self.repo,
number=issue['number'],
title=issue['title'],
body=body,
closing_issues=closing_issues,
review_comments=review_comments,
review_threads=review_threads,
thread_ids=thread_ids,
head_branch=head_branch,
thread_comments=thread_comments,
)
# Extract issue references from PR body and review comments
closing_issues = self.__get_context_from_external_issues_references(
closing_issues,
closing_issues_numbers,
body,
review_comments,
review_threads,
thread_comments,
)

converted_issues.append(issue_details)
issue_details = GithubIssue(
owner=self.owner,
repo=self.repo,
number=issue['number'],
title=issue['title'],
body=body,
closing_issues=closing_issues,
review_comments=review_comments,
review_threads=review_threads,
thread_ids=thread_ids,
head_branch=head_branch,
thread_comments=thread_comments,
)

converted_issues.append(issue_details)
except requests.exceptions.RequestException as e:
logger.warning(f'Failed to fetch issue {issue_number}: {str(e)}')

return converted_issues

Expand Down
55 changes: 55 additions & 0 deletions tests/test_issue_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest
from unittest.mock import MagicMock, patch
from openhands.resolver.issue_definitions import IssueHandler

@patch('openhands.resolver.issue_definitions.LLM')
def test_extract_issue_references(mock_llm):
# Mock LLM since we don't need it for testing issue reference extraction
handler = IssueHandler("owner", "repo", "token", MagicMock())

# Test cases that should NOT match
text_without_refs = """
This is a regular text with no issue references.
Here's a URL: https://github.com/org/repo/issues/123
Here's a code block:
```
Issue #456 should be ignored
```
Here's inline code: `Issue #789`
Here's a URL with hash: https://example.com/page#1234
Here's a version number: v1.2.3
"""
assert handler._extract_issue_references(text_without_refs) == []

# Test cases that SHOULD match
text_with_refs = """
This PR fixes #123
Related to #456 and closes #789
See issue #101 for details
This PR addresses #202
References #303
Fixes: #404
Fixed #505
Closes: #606
Closed #707
Resolves: #808
Resolved #909
"""
assert sorted(handler._extract_issue_references(text_with_refs)) == [101, 123, 202, 303, 404, 456, 505, 606, 707, 789, 808, 909]

# Test edge cases
edge_cases = """
Fixes #1 at start of line
Text fixes #2 in middle
fixes#3without-space
FIXES #4 uppercase
FiXeD #5 mixed case
fixes: #6 with colon
fixes,#7 with comma
fixes;#8 with semicolon
fixes (#9) with parens
fixes [#10] with brackets
fixes{#11}with braces
fixes #12, #13 and #14
"""
assert sorted(handler._extract_issue_references(edge_cases)) == [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Loading
Loading