-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy pathposts.ts
More file actions
82 lines (59 loc) · 2.31 KB
/
posts.ts
File metadata and controls
82 lines (59 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import { JSDOM } from 'jsdom';
import { fetchHtml } from '@/modules/parser/scraper/fetch-html';
import { getThreadPagesUrlsForMonth } from '@/modules/parser/scraper/thread';
import { getPostIdFromHref } from '@/utils/strings';
import { SCRAPER } from '@/constants/scraper';
import type { DbCompanyInsert } from '@/types/parser';
/**
 * Parses the first-level posts of a single thread page into company records.
 *
 * Fetches the page HTML, selects the post nodes with the configured selector and,
 * for each post, derives the company name from the title text and the post id from
 * the post link. A post is skipped when it has no title text, no link, no company
 * name that the configured regex can extract, or no post id.
 *
 * @param pageUrl - Url of the thread page to fetch and parse.
 * @returns Companies found on the page, in the order their posts were matched;
 *   an empty array when no post yields a company.
 */
export const parseCompaniesForPage = async (pageUrl: string): Promise<DbCompanyInsert[]> => {
  const {
    postSelector,
    titleChildSelector,
    linkChildSelector,
    companyNameRegex,
    removeLinkOrBracesRegex,
  } = SCRAPER.posts;

  const htmlContent = await fetchHtml(pageUrl);
  const doc: Document = new JSDOM(htmlContent).window.document;

  // first level posts
  const postNodes = doc.querySelectorAll<HTMLTableRowElement>(postSelector);
  // todo: throw if postNodes empty

  const companies: DbCompanyInsert[] = [];

  for (const postNode of postNodes) {
    // handle DOM elements first
    const titleNode = postNode.querySelector<HTMLDivElement>(titleChildSelector);
    const linkNode = postNode.querySelector<HTMLAnchorElement>(linkChildSelector);

    // if no element, skip
    if (!(titleNode?.textContent && linkNode)) continue;

    // 1. company name
    const titleText = titleNode.textContent.trim();
    // Group 1 is undefined when it did not participate in the match, so guard it
    // instead of calling .trim() on it unconditionally.
    const rawName = titleText.match(companyNameRegex)?.[1]?.trim();
    if (!rawName) continue;

    // Strip a trailing link/braces part. Keep the raw name when the cleanup regex
    // does not match, its group is absent, or the cleaned result would be empty,
    // so a company is never stored with an empty name.
    const cleanedName = rawName.match(removeLinkOrBracesRegex)?.[1]?.trim();
    const name = cleanedName || rawName;

    // 2. postId - link
    const link = linkNode.href;
    const postId = getPostIdFromHref(link); // todo: handle undefined - exception
    if (!postId) continue;

    const company = { name, postId };
    companies.push(company);
  }

  return companies;
};
/**
 * Entry point that collects the parsed companies of one monthly thread.
 *
 * Resolves the urls of all pages of the thread, parses the pages one after
 * another, and de-duplicates the result by company name. When a name occurs more
 * than once, the entry keeps the position of its first occurrence and the data
 * of its last occurrence.
 *
 * @param threadUrl - Absolute thread url.
 * @returns One company record per distinct company name.
 */
export const parseCompaniesForThread = async (threadUrl: string): Promise<DbCompanyInsert[]> => {
  // Keyed by company name; Map keeps the insertion order of first-seen keys,
  // while a later set() on the same key only replaces the value.
  const companiesByName = new Map<DbCompanyInsert['name'], DbCompanyInsert>();

  const threadPages = await getThreadPagesUrlsForMonth(threadUrl);

  for (const threadPage of threadPages) {
    const pageCompanies = await parseCompaniesForPage(threadPage);

    for (const entry of pageCompanies) {
      companiesByName.set(entry.name, entry);
    }
  }

  return [...companiesByName.values()];
};