Skip to content

Commit d266950

Browse files
committed
Merge branch 'main' into ete4
2 parents 1e0376f + e3c93e0 commit d266950

File tree

3 files changed

+618
-602
lines changed

3 files changed

+618
-602
lines changed

har2tree/helper.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,14 @@ def find_identifiers(html_doc: bytes) -> dict[str, list[str]] | None:
277277
else:
278278
to_return[captcha].append(sitekey)
279279

280+
# This is beta and fairly fragile, but it should find most of the Google tag IDs
281+
# https://support.google.com/google-ads/answer/12326985?hl=en_us_us
282+
# NOTE: the doc says the ID has 9 placeholder characters (X), but all the examples I found have 10, so we cannot trust it
283+
if google_tag_ids := set(re.findall(rb"(?:G-|AW-|GA-|UA-)\w{9,13}", html_doc)):
284+
blocklist = {b'UA-Compatible'}
285+
google_tag_ids -= blocklist
286+
to_return['google_tag_ids'] = [i.decode() for i in google_tag_ids]
287+
280288
return to_return
281289

282290

0 commit comments

Comments
 (0)