-
Notifications
You must be signed in to change notification settings - Fork 1
twitter : new stream to extract tweets tagging us #35
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
068cfd8
96d85f2
2d99f81
2bbaab3
272860c
7ac4a7a
36ecd75
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,24 +1,12 @@ | ||
{ | ||
"credentials":{ | ||
"client_id": "some-id", | ||
"client_secret": "some-secret", | ||
"access_token": "some-access-token", | ||
"refresh_token": "some-refresh-token", | ||
"token_expiry_date": "" | ||
"client_id": "your_client_id", | ||
"client_secret": "your_client_secret", | ||
"access_token": "your_access_token", | ||
"refresh_token": "your_refresh_token", | ||
"token_expiry_date": "2024-12-31T00:00:00Z" | ||
}, | ||
"account_id": "123456789", | ||
"account_id": "your_account_id", | ||
"start_time": "2024-01-01T00:00:00Z", | ||
"comment_days_limit": 2, | ||
"filtered_author_ids": [ | ||
"1417373828544487426", | ||
"1527270456658632706", | ||
"1573349900905054212", | ||
"1636287274961829888", | ||
"1101033576454340608", | ||
"1151831284110385152", | ||
"1083104775825252353", | ||
"774689518767181828", | ||
"1783824207631077376", | ||
"18904639" | ||
] | ||
"tags": ["@IFT", "@status", "@Airbyte"] | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
{ | ||
"$schema": "http://json-schema.org/draft-07/schema#", | ||
"type": "object", | ||
"properties": { | ||
"id": { | ||
"type": ["null", "string"] | ||
}, | ||
"text": { | ||
"type": ["null", "string"] | ||
}, | ||
"created_at": { | ||
"type": ["null", "string"] | ||
}, | ||
"author_id": { | ||
"type": ["null", "string"] | ||
}, | ||
"author_username": { | ||
"type": ["null", "string"], | ||
"description": "The Twitter handle/username of the tweet author (e.g., 'john_doe')" | ||
}, | ||
"author_name": { | ||
"type": ["null", "string"], | ||
"description": "The display name of the tweet author (e.g., 'John Doe')" | ||
}, | ||
"author_verified": { | ||
"type": ["null", "boolean"], | ||
"description": "Whether the tweet author is verified" | ||
}, | ||
"conversation_id": { | ||
"type": ["null", "string"] | ||
}, | ||
"reply_settings": { | ||
"type": ["null", "string"] | ||
}, | ||
"matched_tag": { | ||
"type": ["null", "string"], | ||
"description": "The tag that matched this tweet (e.g., '@IFT' or '@status')" | ||
}, | ||
"referenced_tweets": { | ||
"type": ["null", "array"], | ||
"items": { | ||
"type": ["object"], | ||
"properties":{ | ||
"type": { | ||
"type": ["null", "string"] | ||
}, | ||
"id": { | ||
"type": ["null", "string"] | ||
} | ||
} | ||
} | ||
}, | ||
"public_metrics": { | ||
"type": ["null", "object"], | ||
"properties": { | ||
"retweet_count": { | ||
"type": ["null", "number"] | ||
}, | ||
"reply_count": { | ||
"type": ["null", "number"] | ||
}, | ||
"like_count": { | ||
"type": ["null", "number"] | ||
}, | ||
"quote_count": { | ||
"type": ["null", "number"] | ||
}, | ||
"impression_count": { | ||
"type": ["null", "number"] | ||
}, | ||
"bookmark_count": { | ||
"type": ["null", "number"] | ||
} | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,7 @@ connectionSpecification: | |
required: | ||
- credentials | ||
- account_id | ||
- start_time | ||
- tags | ||
properties: | ||
credentials: | ||
title: Twitter Dev account Credentials | ||
|
@@ -42,19 +42,17 @@ connectionSpecification: | |
description: "Id of the Twitter Account to fetch info from" | ||
start_time: | ||
type: string | ||
description: "Start date of fetching data" | ||
description: "Start date of fetching data. If not provided, defaults to 5 days before current time." | ||
format: datetime | ||
comment_days_limit: | ||
type: integer | ||
title: "Comment Days Limit" | ||
description: "Number of days to look back for comments on tweets (default: 2)" | ||
default: 2 | ||
minimum: 1 | ||
maximum: 7 | ||
filtered_author_ids: | ||
type: array | ||
title: "Filtered Author IDs" | ||
description: "List of Twitter author IDs to filter out from comments (e.g., your own organization's account IDs)" | ||
tags: | ||
type: array | ||
title: "Tags to Monitor" | ||
description: "List of Twitter handles to monitor (e.g., ['@IFT', '@Airbyte'])" | ||
items: | ||
type: string | ||
default: [] | ||
type: string | ||
minItems: 1 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why a minimum of 1 item? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Guess I will never know :'( There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If there is no tag, the connector will not work: the "for tag in self.tags" loop never executes. Also, the goal of the connector is to look for tags, so it doesn't make sense to have it empty; that's why it is also a mandatory parameter. |
||
tags_frequent_extractions: | ||
type: boolean | ||
title: "Tags Frequent Extractions" | ||
description: "If true, defaults start_time to 1 hour 15 minutes before current time for more frequent extractions. If false, defaults to 5 days before current time." | ||
default: false |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
from typing import Any, Iterable, Mapping, MutableMapping, Optional, List | ||
import logging | ||
import requests | ||
import time | ||
from datetime import datetime, timedelta | ||
from airbyte_cdk.sources.streams import Stream | ||
from airbyte_cdk.sources.streams.http import HttpStream | ||
|
||
from .tweets_stream import TwitterStream | ||
|
||
logger = logging.getLogger("airbyte") | ||
|
||
class TagsStream(TwitterStream): | ||
primary_key = "id" | ||
|
||
def __init__(self, start_time: str = None, account_id: str = None, tags: List[str] = None, tags_frequent_extractions: bool = False, **kwargs):
    """Set up the tag-search stream.

    Args:
        start_time: Forwarded unchanged to the parent initializer.
        account_id: Forwarded unchanged to the parent initializer.
        tags: Search terms to monitor; one stream slice is produced per tag.
            A falsy value is replaced by an empty list.
        tags_frequent_extractions: Selects the fallback look-back window used
            when no start time is set: 1 hour 15 minutes if true, 5 days
            otherwise.
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    super().__init__(start_time=start_time, account_id=account_id, **kwargs)

    # Only compute a default when start_time is still falsy after the parent
    # initializer has run.
    if not self.start_time:
        lookback = (
            timedelta(hours=1, minutes=15)
            if tags_frequent_extractions
            else timedelta(days=5)
        )
        self.start_time = datetime.utcnow() - lookback

    self.tags = tags if tags else []
|
||
def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
    """Yield one slice of the form ``{"tag": <tag>}`` per entry in ``self.tags``."""
    yield from ({"tag": monitored_tag} for monitored_tag in self.tags)
|
||
def path(
    self,
    stream_state: Mapping[str, Any] = None,
    stream_slice: Mapping[str, Any] = None,
    next_page_token: Mapping[str, Any] = None
) -> str:
    """Return the endpoint path used by this stream; all arguments are ignored."""
    # Recent-search endpoint: it only fetches data from the last 7 days.
    recent_search_endpoint = "tweets/search/recent"
    return recent_search_endpoint
|
||
def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
    """Extract the pagination cursor from a search response.

    Args:
        response: HTTP response whose JSON body may contain a ``meta`` object.

    Returns:
        ``{"next_token": <token>}`` when ``meta`` carries a ``next_token`` and
        a positive ``result_count``; otherwise ``None`` to stop paginating.
    """
    # Decode the body once: each response.json() call re-parses the payload.
    meta = response.json().get('meta') or {}
    cursor = meta.get('next_token')

    # A missing or null cursor, or a page without results, ends pagination.
    # A missing result_count is treated as zero rather than raising KeyError.
    if cursor is None or (meta.get('result_count') or 0) <= 0:
        return None

    logger.debug('DBG-NT: %s', cursor)
    return {"next_token": cursor}
Comment on lines
+41
to
+44
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should probably be mutualized in TwitterStream. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What do you think? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I agree! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In the end it can't be mutualized here because:
|
||
|
||
def request_params(
    self,
    next_page_token: Optional[Mapping[str, Any]] = None,
    stream_state: Mapping[str, Any] = None,
    stream_slice: Mapping[str, Any] = None
) -> MutableMapping[str, Any]:
    """Build the query-string parameters for one search request.

    Args:
        next_page_token: Pagination parameters to merge in, if any.
        stream_state: Unused.
        stream_slice: Current slice; its ``"tag"`` entry becomes the query.

    Returns:
        A mutable mapping with the query, the requested tweet and user fields,
        the page size, the formatted start time and any pagination keys.
    """
    # assumes self.start_time exposes strftime (e.g. a datetime) — TODO confirm
    # what the parent initializer stores when a start time is supplied.
    query_params = {
        "query": stream_slice["tag"],
        "tweet.fields": "text,public_metrics,author_id,referenced_tweets,created_at",
        "expansions": "author_id",
        "user.fields": "username,name,verified,public_metrics",
        "max_results": 100,
        "start_time": self.start_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
    }
    if next_page_token:
        query_params.update(**next_page_token)
    return query_params
|
||
def parse_response(
    self,
    response: requests.Response,
    stream_slice: Mapping[str, Any] = None,
    **kwargs
) -> Iterable[Mapping]:
    """Yield the tweets of one response page, enriched with tag and author data.

    Each tweet dict from the ``data`` array is mutated in place: it receives a
    ``matched_tag`` key (the tag of the current slice) and, when its author is
    present in ``includes.users``, the ``author_username``, ``author_name`` and
    ``author_verified`` keys.

    Args:
        response: HTTP response whose JSON body is parsed.
        stream_slice: Current slice; its ``"tag"`` entry is recorded on every
            tweet.
        **kwargs: Unused.

    Yields:
        The enriched tweet dicts, in response order.
    """
    # Decode the body once and reuse it for both logging and parsing.
    response_data = response.json()
    logger.debug("Full response %s", response_data)

    # User data is returned separately in the includes.users array, so index it
    # by id to join it onto each tweet through author_id.
    users_by_id = {}
    if 'includes' in response_data and 'users' in response_data['includes']:
        users_by_id = {user['id']: user for user in response_data['includes']['users']}

    for tweet in response_data.get('data', []):
        tweet["matched_tag"] = stream_slice["tag"]

        author_id = tweet.get('author_id')
        if author_id and author_id in users_by_id:
            author = users_by_id[author_id]
            tweet["author_username"] = author.get('username')
            tweet["author_name"] = author.get('name')
            tweet["author_verified"] = author.get('verified')

        yield tweet

    # NOTE(review): the extracted source lost its indentation, so the nesting of
    # this call is assumed to be once per page, after all tweets — confirm
    # against the repository.
    self._apply_rate_limiting()
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did you add whitespace at the end?