Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions socid_extractor/main.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import html as html_module
import logging
from http.cookies import SimpleCookie

Expand Down Expand Up @@ -120,7 +121,15 @@ def extract(page):
else:
groupdict = regexp_group.groupdict()
if groupdict:
values = groupdict
values = {}
for k, v in groupdict.items():
if k.endswith('_raw'):
base = k[:-4]
if not values.get(base):
values[base] = v
else:
if not values.get(k):
values[k] = v
else:
extracted = regexp_group.group(1)
logging.debug('Extracted: %s', extracted)
Expand All @@ -145,7 +154,8 @@ def extract(page):
except PROCESS_ERRORS as e:
logging.debug('Postprocess error: ', e)

return {a: b for a, b in values.items() if b or type(b) == bool}
return {a: html_module.unescape(b) if isinstance(b, str) else b
for a, b in values.items() if b or type(b) == bool}

# all schemes have been checked
return {}
2 changes: 1 addition & 1 deletion socid_extractor/schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2490,7 +2490,7 @@
'Threads': {
'url_hints': ('threads.net', 'threads.com'),
'flags': ['Threads, Say more', 'barcelona'],
'regex': r'og:title" content="(?P<fullname>[^"]*?) \((?:@|&#064;)(?P<username>[^)]+)\)[^"]*Threads[\s\S]*?og:description" content="(?P<follower_count>[\d,]+) Followers[^"]*?(?P<posts_count>[\d,]+) Threads[^"]*?(?:&quot;|")(?P<bio>[^"&]*)',
'regex': r'og:title" content="(?P<fullname>[^"]*?) \((?:@|&#064;)(?P<username>[^)]+)\)[^"]*Threads[\s\S]*?og:description" content="(?P<follower_count>[\d,]+) Followers[^"]*?(?P<posts_count>[\d,]+) Threads(?:[^"]*?(?:&#x2022;|\xb7) (?:&quot;(?P<bio>[^&]+)&quot;|(?P<bio_raw>[^".]*)\.))?',
},
'Smule': {
'url_hints': ('smule.com',),
Expand Down
60 changes: 57 additions & 3 deletions tests/test_socid_improvements.py
Original file line number Diff line number Diff line change
Expand Up @@ -931,8 +931,8 @@ def test_threads_profile_extraction():
"""Threads: extract fullname, username, follower/post counts, bio from OG tags."""
html = (
'<!DOCTYPE html><html><head>'
'<meta property="og:title" content="Mark Zuckerberg (@zuck) · Threads, Say more">'
'<meta property="og:description" content="12,500 Followers · 340 Threads · &quot;CEO of Meta&quot;">'
'<meta property="og:title" content="Mark Zuckerberg (&#064;zuck) &#x2022; Threads, Say more">'
'<meta property="og:description" content="12,500 Followers &#x2022; 340 Threads &#x2022; &quot;CEO of Meta&quot;. See the latest.">'
'</head><body>'
'<div class="barcelona">content</div>'
'</body></html>'
Expand Down Expand Up @@ -1190,12 +1190,66 @@ def test_threads_profile():
)
info = extract(html)
assert info.get('username') == 'fusteee'
assert info.get('fullname') == 'Marc Fuste&#xe9;'
assert info.get('fullname') == 'Marc Fusteé'
assert info.get('follower_count') == '33'
assert info.get('posts_count') == '0'
assert info.get('bio') == 'Regalame tus mejores noches'


def test_threads_profile_no_bio():
    """Threads: a profile with no bio must not yield a ``bio`` value.

    The og:description below goes straight from the thread count to the
    "See the latest conversations" boilerplate, so nothing following the
    counts may be reported as a bio.
    """
    head_tags = (
        '<meta property="og:title" content="Alice (&#064;alice_test) &#x2022; Threads, Say more">',
        '<meta property="og:description" content="5 Followers &#x2022; 0 Threads. See the latest conversations with &#064;alice_test.">',
    )
    page = ''.join((
        '<!DOCTYPE html><html><head>',
        *head_tags,
        '</head><body>',
        '<div class="barcelona">content</div>',
        '</body></html>',
    ))

    result = extract(page)

    # Checked in the same order as the individual fields appear in the page.
    expected = {
        'username': 'alice_test',
        'fullname': 'Alice',
        'follower_count': '5',
        'posts_count': '0',
    }
    for field, value in expected.items():
        assert result.get(field) == value

    # Missing or empty are both acceptable; a non-empty bio is not.
    assert not result.get('bio')


def test_threads_profile_unicode_name():
    """Threads: a fullname written as HTML numeric entities comes back decoded.

    The og:title carries the name as three ``&#x1d4..;`` entities; the
    extracted ``fullname`` is expected to be the corresponding characters,
    not the raw entity text.
    """
    head_tags = (
        '<meta property="og:title" content="&#x1d4d0;&#x1d4fb;&#x1d4fd; (&#064;bob_test) &#x2022; Threads, Say more">',
        '<meta property="og:description" content="10 Followers &#x2022; 3 Threads &#x2022; &quot;hello&quot;. See the latest conversations.">',
    )
    page = ''.join((
        '<!DOCTYPE html><html><head>',
        *head_tags,
        '</head><body>',
        '<div class="barcelona">content</div>',
        '</body></html>',
    ))

    result = extract(page)

    expected = {
        'username': 'bob_test',
        'fullname': '\U0001d4d0\U0001d4fb\U0001d4fd',
        'follower_count': '10',
        'posts_count': '3',
        'bio': 'hello',
    }
    for field, value in expected.items():
        assert result.get(field) == value


def test_threads_profile_emoji_bio():
    """Threads: an unquoted, emoji-only bio is extracted under ``bio``.

    Here the bio in og:description is a bare ``&#x1f92b;`` entity with no
    surrounding ``&quot;`` pair. It must come back as the decoded emoji under
    the ``bio`` key, and no ``bio_raw`` key may appear in the result.
    """
    head_tags = (
        '<meta property="og:title" content="Eve (&#064;eve_test) &#x2022; Threads, Say more">',
        '<meta property="og:description" content="21 Followers &#x2022; 4 Threads &#x2022; &#x1f92b;. See the latest conversations.">',
    )
    page = ''.join((
        '<!DOCTYPE html><html><head>',
        *head_tags,
        '</head><body>',
        '<div class="barcelona">content</div>',
        '</body></html>',
    ))

    result = extract(page)

    expected = {
        'username': 'eve_test',
        'follower_count': '21',
        'posts_count': '4',
        'bio': '🤫',
    }
    for field, value in expected.items():
        assert result.get(field) == value

    # The helper capture-group name must not be exposed to callers.
    assert not result.get('bio_raw')


def test_chess_com_html_profile():
"""Chess.com HTML: extract fullname, username and image from og:meta."""
html = (
Expand Down
Loading