Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 52 additions & 12 deletions socid_extractor/schemes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1239,18 +1239,58 @@
# unactual
'Vimeo HTML': {
'url_hints': ('vimeo.com', 'vimeocdn.com'),
'flags': ['https://i.vimeocdn.com/favicon/main-touch'],
'regex': r'"app_config":({"user":.+?})},\"coach_notes',
'extract_json': True,
'fields': {
'uid': lambda x: x['user']['id'],
'name': lambda x: x['user']['display_name'],
'username': lambda x: x['user']['name'],
'location': lambda x: x['user']['location'],
'created_at': lambda x: x['user']['join_date']['raw'],
'account_type': lambda x: x['user']['account_type'],
'is_staff': lambda x: x['user']['is_staff'],
'links': lambda x: [a['url'] for a in x['user']['links']],
# NOTE(review): presumably substrings that must all be present in the page for
# this scheme to be tried — confirm against the extraction engine.
'flags': ['ProfilePage', 'vimeo://app.vimeo.com/users/', 'vimeocdn.com'],
# Capture the JSON array of objects embedded in the ld+json <script> tag.
# The body match is non-greedy: it ends at the first `}]` that is followed by
# optional whitespace and `</script>`.
'regex': r'<script type="application/ld\+json">\s*(\[\{[\s\S]*?\}\])\s*</script>',
'extract_json': True,
# Decode the captured text, keep only the first element when it is a list, and
# serialize it back to a JSON string.
# NOTE(review): assumes the engine applies `transforms` to the raw capture
# before its own JSON decoding, so the field lambdas see a single object —
# confirm.
'transforms': [
json.loads,
lambda x: x[0] if isinstance(x, list) else x,
json.dumps,
],
'fields': {
'uid': lambda x: x['mainEntity'].get('identifier'),
'username': lambda x: x['mainEntity'].get('alternateName'),
'fullname': lambda x: x['mainEntity'].get('name'),
'bio': lambda x: html.unescape(x['mainEntity'].get('description', '') or '') or None,
'image': lambda x: x['mainEntity'].get('image') or None,
'created_at': lambda x: x.get('dateCreated'),
'updated_at': lambda x: x.get('dateModified'),
'follower_count': lambda x: (x['mainEntity'].get('interactionStatistic') or {}).get('userInteractionCount'),
'videos_count': lambda x: (x['mainEntity'].get('agentInteractionStatistic') or {}).get('userInteractionCount'),
'links': lambda x: ', '.join(
link for link in (x['mainEntity'].get('sameAs') or [])
if not link.startswith('https://vimeo.com/')
) or None,
'twitter_url': lambda x: next(
(link for link in (x['mainEntity'].get('sameAs') or [])
if 'twitter.com' in link or 'x.com' in link),
None,
),
'instagram_url': lambda x: next(
(link for link in (x['mainEntity'].get('sameAs') or [])
if 'instagram.com' in link),
None,
),
'facebook_url': lambda x: next(
(link for link in (x['mainEntity'].get('sameAs') or [])
if 'facebook.com' in link),
None,
),
'youtube_url': lambda x: next(
(link for link in (x['mainEntity'].get('sameAs') or [])
if 'youtube.com' in link),
None,
),
'tiktok_url': lambda x: next(
(link for link in (x['mainEntity'].get('sameAs') or [])
if 'tiktok.com' in link),
None,
),
'linkedin_url': lambda x: next(
(link for link in (x['mainEntity'].get('sameAs') or [])
if 'linkedin.com' in link),
None,
),
}
},
'Vimeo GraphQL API': {
Expand Down
51 changes: 34 additions & 17 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -656,18 +656,29 @@ def test_eyeem():
assert info.get('facebook_uid') == '1610716256'


@pytest.mark.skip(reason="Broken, now API only: https://api.vimeo.com/users/alexaimephotography")
def test_vimeo():
    """Check the fields extracted from a live Vimeo profile page (skipped)."""
    page = parse('https://vimeo.com/alexaimephotography')[0]
    info = extract(page)

    # Every field is compared as an exact string.
    expected = {
        'uid': '75857717',
        'name': 'AlexAimePhotography',
        'username': 'alexaimephotography',
        'location': 'France',
        'created_at': '2017-12-06 06:49:28',
        'is_staff': 'False',
        'links': "['https://500px.com/alexaimephotography', 'https://www.flickr.com/photos/photoambiance/', 'https://www.instagram.com/alexaimephotography/', 'https://www.youtube.com/channel/UC4NiYV3Yqih2WHcwKg4uPuQ', 'https://flii.by/alexaimephotography/']",
    }
    for field, value in expected.items():
        assert info.get(field) == value
def test_vimeo_html_e2e():
    """Vimeo HTML"""
    page = parse('https://vimeo.com/staff')[0]
    info = extract(page)

    # Identity fields and the six social crosslinks must match exactly.
    exact_values = {
        'uid': '152184',
        'username': 'staff',
        'fullname': 'Vimeo',
        'created_at': '2007-01-18T16:40:11Z',
        'twitter_url': 'https://twitter.com/vimeo',
        'instagram_url': 'https://www.instagram.com/vimeo/',
        'facebook_url': 'https://www.facebook.com/Vimeo/',
        'youtube_url': 'https://www.youtube.com/@vimeo/',
        'linkedin_url': 'https://www.linkedin.com/company/vimeo/',
        'tiktok_url': 'https://www.tiktok.com/@vimeo?lang=en',
    }
    for field, value in exact_values.items():
        assert info.get(field) == value

    # Free-text fields are only checked for a stable fragment.
    bio = info.get('bio', '')
    image = info.get('image', '')
    assert 'innovative video experience platform' in bio
    assert 'i.vimeocdn.com/portrait/' in image
    assert info.get('updated_at')

    # Counters grow over time, so only lower bounds are asserted.
    followers = int(info.get('follower_count', 0))
    videos = int(info.get('videos_count', 0))
    assert followers >= 30000
    assert videos >= 2000

    # The combined `links` field excludes the profile's own vimeo.com URL.
    links = info.get('links', '')
    assert 'vimeo.com/staff' not in links
    assert 'twitter.com/vimeo' in links


@pytest.mark.skip(reason="broken")
Expand Down Expand Up @@ -843,17 +854,23 @@ def test_tiktok_hydration_e2e():
assert 'fullname' in info


# NOTE(review): this span interleaves two variants of the same statements
# (two `info = ...` assignments; exact-value and presence-only assertions on
# the same fields). Confirm which variant is intended before relying on it.
@pytest.mark.github_failed
def test_picsart_api_e2e():
"""
Picsart API
"""
# NOTE(review): the result of this direct API fetch is overwritten by the
# second `info = ...` assignment below before it is ever read.
info = extract(parse('https://api.picsart.com/users/show/adam.json', timeout=15)[0])
# Expand the public profile URL with mutate_url and use the first candidate,
# which is unpacked as a (url, extra headers) pair.
URL = 'https://www.picsart.com/u/adam'
mutated = mutate_url(URL)
assert len(mutated) >= 1
url, add_headers = mutated[0]

# Fetch the mutated URL with its extra headers and extract the profile.
info = extract(parse(url, headers=add_headers, timeout=15)[0])

assert info.get('picsart_username') == 'adam'
# Exact-value checks.
assert info.get('fullname') == 'Adam'
assert info.get('picsart_id') == '184924161000102'
assert info.get('is_verified') == 'False'
assert int(info.get('follower_count')) >= 0
# Presence-only checks on the same fields.
assert info.get('fullname') is not None
assert info.get('picsart_id') is not None
assert info.get('is_verified') in ('True', 'False')
assert 'follower_count' in info


def test_imgur_api_e2e():
Expand Down
99 changes: 99 additions & 0 deletions tests/test_socid_improvements.py
Original file line number Diff line number Diff line change
Expand Up @@ -1870,3 +1870,102 @@ def test_hive_blog_empty_profile():
assert info.get('bio') is None
assert info.get('image') is None
assert info.get('website') is None


def test_vimeo_html_ld_json():
    """Vimeo HTML: a full ld+json ProfilePage yields profile fields and social crosslinks."""
    followers = {
        "@type": "InteractionCounter",
        "interactionType": "https://schema.org/FollowAction",
        "userInteractionCount": 1621,
    }
    uploads = {
        "@type": "InteractionCounter",
        "interactionType": "https://schema.org/WriteAction",
        "userInteractionCount": 519,
    }
    # The first entry is the profile's own URL; the rest are external links.
    same_as = [
        "https://vimeo.com/testuser",
        "http://testuser.example.com",
        "http://twitter.com/testuser",
        "https://www.instagram.com/testuser/",
        "https://www.facebook.com/testuser/",
        "https://www.youtube.com/@testuser/",
        "https://www.tiktok.com/@testuser",
        "https://www.linkedin.com/in/testuser/",
    ]
    person = {
        "@type": "Person",
        "name": "Test User",
        "identifier": 12345,
        "alternateName": "testuser",
        "interactionStatistic": followers,
        "agentInteractionStatistic": uploads,
        "description": "Bio with &#039;quotes&#039; and entities",
        "image": "https://i.vimeocdn.com/portrait/12345_640x640",
        "url": "/testuser",
        "sameAs": same_as,
    }
    profile_page = {
        "dateCreated": "2006-12-11T19:57:24Z",
        "dateModified": "2022-08-01T02:48:55Z",
        "url": "https://vimeo.com/testuser",
        "mainEntity": person,
        "potentialAction": {
            "@type": "ViewAction",
            "target": "vimeo://app.vimeo.com/users/12345",
        },
        "@type": "ProfilePage",
        "@context": "http://schema.org",
    }
    ld_json = json.dumps([profile_page])
    html_page = ''.join([
        '<!DOCTYPE html><html><head>',
        f'<script type="application/ld+json">{ld_json}</script>',
        '</head><body></body></html>',
    ])

    info = extract(html_page)

    # Fields with an exact expected value (numbers are compared as strings).
    expected = {
        'uid': '12345',
        'username': 'testuser',
        'fullname': 'Test User',
        'bio': "Bio with 'quotes' and entities",
        'created_at': '2006-12-11T19:57:24Z',
        'updated_at': '2022-08-01T02:48:55Z',
        'follower_count': '1621',
        'videos_count': '519',
        'twitter_url': 'http://twitter.com/testuser',
        'instagram_url': 'https://www.instagram.com/testuser/',
        'facebook_url': 'https://www.facebook.com/testuser/',
        'youtube_url': 'https://www.youtube.com/@testuser/',
        'tiktok_url': 'https://www.tiktok.com/@testuser',
        'linkedin_url': 'https://www.linkedin.com/in/testuser/',
    }
    for field, value in expected.items():
        assert info.get(field) == value

    assert 'vimeocdn.com' in info.get('image', '')

    # vimeo.com self-link should be excluded from `links`
    links = info.get('links', '')
    assert 'vimeo.com/testuser' not in links
    assert 'testuser.example.com' in links


def test_vimeo_html_minimal_profile():
    """Vimeo HTML: a profile without description or external links yields None for them."""
    # The only sameAs entry is the profile's own URL.
    person = {
        "@type": "Person",
        "name": "Minimal",
        "identifier": 99999,
        "alternateName": "minimal",
        "image": "https://i.vimeocdn.com/portrait/default",
        "sameAs": ["https://vimeo.com/minimal"],
    }
    profile_page = {
        "dateCreated": "2020-01-01T00:00:00Z",
        "dateModified": "2020-01-01T00:00:00Z",
        "url": "https://vimeo.com/minimal",
        "mainEntity": person,
        "potentialAction": {"target": "vimeo://app.vimeo.com/users/99999"},
        "@type": "ProfilePage",
    }
    ld_json = json.dumps([profile_page])
    html_page = ''.join([
        '<!DOCTYPE html><html><head>',
        f'<script type="application/ld+json">{ld_json}</script>',
        '</head><body></body></html>',
    ])

    info = extract(html_page)

    # Identity fields are still extracted.
    present = {'uid': '99999', 'username': 'minimal', 'fullname': 'Minimal'}
    for field, value in present.items():
        assert info.get(field) == value

    # Everything derived from the missing description / external links is None.
    for field in ('bio', 'twitter_url', 'links'):
        assert info.get(field) is None
Loading