-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy paththreads.ts
More file actions
88 lines (61 loc) · 2.76 KB
/
threads.ts
File metadata and controls
88 lines (61 loc) · 2.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import { JSDOM } from 'jsdom';
import { fetchHtml } from '@/modules/parser/scraper/fetch-html';
import { convertDateToMonthName } from '@/libs/datetime';
import { getPostIdFromHref } from '@/utils/strings';
import { SCRAPER } from '@/constants/scraper';
import type { Thread } from '@/types/parser';
// todo: Support pagination later.
/**
 * Scrapes the threads listing page and returns one entry per "hiring" thread.
 *
 * Only the first page of the paginated listing is handled (30 items, 10 months).
 * Every listing item is read from two consecutive `<tr>` rows: the first holds
 * the title link, the second holds the link whose parent carries the date in
 * its `title` attribute.
 *
 * Rows missing any expected piece (id, sibling row, title link, post id,
 * date attribute, parsable date) are skipped rather than raising. Errors from
 * `fetchHtml` are not caught here and propagate to the caller.
 *
 * @returns Threads as `{ month, postId }`, in the order the rows appear in the document.
 */
export const getThreads = async (): Promise<Thread[]> => {
  const {
    threadsUrl,
    threadPostFirstTrSelector,
    threadLinkSelectorTemplate,
    threadIdPlaceholder,
    hasHiringRegex,
  } = SCRAPER.threads;

  const htmlContent = await fetchHtml(threadsUrl);
  const doc: Document = new JSDOM(htmlContent).window.document;
  const threadFirstTrNodes = doc.querySelectorAll<HTMLTableRowElement>(threadPostFirstTrSelector);

  const threads: Thread[] = [];

  for (const threadFirstTrNode of threadFirstTrNodes) {
    // Work with these 2 tr nodes below, not the entire DOM.
    const threadSecondTrNode = threadFirstTrNode.nextElementSibling;
    const threadId = threadFirstTrNode.id;
    if (!threadId || !threadSecondTrNode) continue;

    // 1. Get postId from the title link's href (first tr).
    // The same selector is reused for both trs.
    const threadLinkSelector = threadLinkSelectorTemplate.replace(threadIdPlaceholder, threadId);

    // The only matching link in the first tr is the title.
    const threadTitleNode = threadFirstTrNode.querySelector<HTMLAnchorElement>(threadLinkSelector);
    const textContent = threadTitleNode?.textContent;
    const href = threadTitleNode?.href;
    if (!textContent || !href) continue;

    const postId = getPostIdFromHref(href);
    if (!postId) continue;

    // Keep only posts whose title matches the 'hiring' regex.
    // `search` ignores and preserves the regex's `lastIndex`, so the result
    // cannot depend on a previous iteration even if the shared regex is
    // declared with the global/sticky flag.
    const isHiringPost = textContent.search(hasHiringRegex) !== -1;
    if (!isHiringPost) continue;

    // 2. Get the month name (presumably 'yyyy-MM', per the original note) from the second tr.
    const threadsLinkNodes =
      threadSecondTrNode.querySelectorAll<HTMLAnchorElement>(threadLinkSelector);
    // Two links are expected; the first one's parent element carries the date.
    if (!(threadsLinkNodes.length > 1)) continue;

    // `parentElement` is already typed `Element | null`, so no assertion is needed.
    const dateTitleAttribute = threadsLinkNodes[0].parentElement?.getAttribute('title');
    if (!dateTitleAttribute) continue;

    // Only the part before the first space is fed to `Date`.
    const dateString = dateTitleAttribute.split(' ')[0];
    const dateObject = new Date(dateString);
    if (isNaN(dateObject.getTime())) continue;

    const monthName = convertDateToMonthName(dateObject);

    threads.push({ month: monthName, postId });
  }

  return threads;
};
/**
 * Collects the `month` value of every thread returned by `getThreads`.
 *
 * @returns One month string per thread, in the same order as `getThreads` yields them.
 */
export const getAllMonths = async (): Promise<string[]> => {
  const months: string[] = [];
  for (const { month } of await getThreads()) {
    months.push(month);
  }
  return months;
};