Skip to content

Commit bb83210

Browse files
committed
fix: use link header to get next issue page
GitHub's page-number-based pagination no longer works reliably with large datasets, so follow the `Link` response header to fetch the next page of issues instead.
1 parent 98827a6 commit bb83210

File tree

2 files changed

+20
-13
lines changed

2 files changed

+20
-13
lines changed

issue-bot/src/github.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,21 +202,24 @@ impl GithubApi {
202202

203203
pub(crate) fn get_issues(
204204
&self,
205-
from_page: i32,
205+
from_url: Option<String>,
206206
repo_data: RepositoryData,
207-
) -> impl Stream<Item = Result<(IssueWithComments, Option<i32>), GithubApiError>> + use<'_>
207+
) -> impl Stream<Item = Result<(IssueWithComments, Option<String>), GithubApiError>> + use<'_>
208208
{
209209
try_stream! {
210-
let url = format!("https://api.github.com/repos/{}/issues", repo_data.full_name);
211210
let client = self.client.clone();
212-
let mut page = from_page;
211+
let mut url = if let Some(from_url) = from_url {
212+
info!("resuming fetching issues from repo {} at {}", repo_data.full_name, from_url);
213+
from_url
214+
} else {
215+
format!("https://api.github.com/repos/{}/issues", repo_data.full_name)
216+
};
213217
loop {
214218
let res = client
215219
.get(&url)
216220
.query(&[
217221
("state", "all"),
218222
("direction", "desc"),
219-
("page", &page.to_string()),
220223
("per_page", "100"),
221224
])
222225
.send()
@@ -236,8 +239,11 @@ impl GithubApi {
236239
break;
237240
}
238241
};
239-
info!("fetched {} issues from page {}, getting comments for each issue next", issues.len(), page);
242+
info!("fetched {} issues from {}, getting comments for each issue next", issues.len(), url);
240243
let page_issue_count = issues.len();
244+
if let Some(next_url) = get_next_page(link_header.clone())? {
245+
url = next_url;
246+
};
241247
for (i, issue) in issues.into_iter().enumerate() {
242248
loop {
243249
let res = client
@@ -259,14 +265,13 @@ impl GithubApi {
259265
break;
260266
}
261267
};
262-
yield (IssueWithComments::new(issue, comments), (i + 1 == page_issue_count).then_some(page));
268+
yield (IssueWithComments::new(issue, comments), (i + 1 == page_issue_count).then_some(url.clone()));
263269
break;
264270
}
265271
}
266272
if get_next_page(link_header)?.is_none() {
267273
break;
268274
}
269-
page += 1;
270275
}
271276
}
272277
}

issue-bot/src/main.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -268,13 +268,15 @@ struct ClosestIssue {
268268

269269
#[derive(Debug, Deserialize, Serialize)]
270270
enum JobData {
271-
IssueIndexation { issues_page: i32 },
271+
// FIXME: naming is a bit confusing, this means "repository issue indexation"
272+
IssueIndexation { next_url: String },
272273
EmbeddingsRegeneration { current_issue: i32 },
273274
}
274275

275276
#[derive(Debug, sqlx::Type)]
276277
#[sqlx(type_name = "job_type", rename_all = "snake_case")]
277278
enum JobType {
279+
// FIXME: naming is a bit confusing, this means "repository issue indexation"
278280
IssueIndexation,
279281
EmbeddingsRegeneration,
280282
}
@@ -576,11 +578,11 @@ async fn handle_webhooks(
576578
}
577579
};
578580
let from_issues_page =
579-
job.as_ref().and_then(|j| match j.data.0 { JobData::IssueIndexation { issues_page } => Some(issues_page + 1), _ => None}).unwrap_or(1);
581+
job.and_then(|j| match j.data.0 { JobData::IssueIndexation { next_url } => Some(next_url), _ => None});
580582
let issues = github_api.get_issues(from_issues_page, repo_data.clone());
581583
pin_mut!(issues);
582584
while let Some(issue) = issues.next().await {
583-
let (issue, page) = match issue {
585+
let (issue, next_url) = match issue {
584586
Ok(issue) => issue,
585587
Err(err) => {
586588
error!(err = err.to_string(), "error fetching next item from issues stream");
@@ -664,7 +666,7 @@ async fn handle_webhooks(
664666
error!(issue_number = issue.number, err = err.to_string(), "error inserting comments");
665667
}
666668
}
667-
if let Some(page) = page {
669+
if let Some(next_url) = next_url {
668670
if let Err(err) = sqlx::query(
669671
r#"insert into jobs (data, job_type, repository_full_name)
670672
values ($1, $2, $3)
@@ -675,7 +677,7 @@ async fn handle_webhooks(
675677
updated_at = current_timestamp"#,
676678
)
677679
.bind(Json(JobData::IssueIndexation {
678-
issues_page: page,
680+
next_url,
679681
}))
680682
.bind(JobType::IssueIndexation)
681683
.bind(&repo_data.full_name)

0 commit comments

Comments (0)