
Commit 068cfd8 (1 parent: 6473ce1)

twitter: new stream to extract tweets tagging us
8 files changed: +173 −11 lines

source-twitter-fetcher/Dockerfile
Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-FROM airbyte/python-connector-base:1.1.0@sha256:dd17e347fbda94f7c3abff539be298a65af2d7fc27a307d89297df1081a45c27
-
+#FROM airbyte/python-connector-base:1.1.0@sha256:dd17e347fbda94f7c3abff539be298a65af2d7fc27a307d89297df1081a45c27
+FROM --platform=linux/amd64 airbyte/python-connector-base:1.1.0
 COPY . ./airbyte/integration_code
 RUN pip install ./airbyte/integration_code

source-twitter-fetcher/metadata.yaml
Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ data:
   connectorType: source
   definitionId: 1c448bfb-8950-478c-9ae0-f03aaaf4e920
   dockerImageTag: '1.0.0'
-  dockerRepository: harbor.status.im/bi/airbyte/source-twitter-fetcher
+  dockerRepository: harbor.status.im/bi/airbyte/source-twitter-fetcher
   githubIssueLabel: source-twitter-fetcher
   icon: twitter-fetcher.svg
   license: MIT
Sample connector config (file path not shown)
Lines changed: 8 additions & 7 deletions

@@ -1,11 +1,12 @@
 {
   "credentials": {
-    "client_id": "some-id",
-    "client_secret": "some-secret",
-    "access_token": "some-access-token",
-    "refresh_token": "some-refresh-token",
-    "token_expiry_date": ""
+    "client_id": "your_client_id",
+    "client_secret": "your_client_secret",
+    "access_token": "your_access_token",
+    "refresh_token": "your_refresh_token",
+    "token_expiry_date": "2024-12-31T00:00:00Z"
   },
-  "account_id": "123456789",
-  "start_time": "2024-01-01T00:00:00Z"
+  "account_id": "your_account_id",
+  "start_time": "2024-01-01T00:00:00Z",
+  "tags": ["@IFT", "@status", "@Airbyte"]
 }
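
The new "tags" field is what the rest of this commit keys off: the connector spec (further down) marks it as required with at least one entry. As a quick illustration, here is a minimal, hedged Python sketch (not part of the commit) that sanity-checks a config shaped like the sample above before running the connector; the file path is hypothetical.

import json

CONFIG_PATH = "secrets/config.json"  # hypothetical path to a config shaped like the sample above

with open(CONFIG_PATH) as f:
    config = json.load(f)

# Mirror the connector spec: credentials, account_id, start_time and tags are all required,
# and tags must contain at least one handle (minItems: 1 in spec.yaml).
for key in ("credentials", "account_id", "start_time", "tags"):
    assert key in config, f"missing required config key: {key}"
assert isinstance(config["tags"], list) and len(config["tags"]) >= 1, "tags must be a non-empty list"
print("config contains", len(config["tags"]), "tag(s) to monitor")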

source-twitter-fetcher/sample_files/configured_catalog.json
Lines changed: 11 additions & 0 deletions

@@ -111,6 +111,17 @@
       },
       "sync_mode": "incremental",
       "destination_sync_mode": "overwrite"
+    },
+    {
+      "stream": {
+        "name": "tags",
+        "json_schema": {},
+        "supported_sync_modes": ["full_refresh"],
+        "source_defined_cursor": false,
+        "default_cursor_field": []
+      },
+      "sync_mode": "full_refresh",
+      "destination_sync_mode": "overwrite"
     }
   ]
 }
New JSON schema for the tags stream (file path not shown)
Lines changed: 65 additions & 0 deletions (new file)

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "id": {
      "type": ["null", "string"]
    },
    "text": {
      "type": ["null", "string"]
    },
    "created_at": {
      "type": ["null", "string"]
    },
    "author_id": {
      "type": ["null", "string"]
    },
    "conversation_id": {
      "type": ["null", "string"]
    },
    "reply_settings": {
      "type": ["null", "string"]
    },
    "matched_tag": {
      "type": ["null", "string"],
      "description": "The tag that matched this tweet (e.g., '@IFT' or '@status')"
    },
    "referenced_tweets": {
      "type": ["null", "array"],
      "items": {
        "type": ["object"],
        "properties": {
          "type": {
            "type": ["null", "string"]
          },
          "id": {
            "type": ["null", "string"]
          }
        }
      }
    },
    "public_metrics": {
      "type": ["null", "object"],
      "properties": {
        "retweet_count": {
          "type": ["null", "number"]
        },
        "reply_count": {
          "type": ["null", "number"]
        },
        "like_count": {
          "type": ["null", "number"]
        },
        "quote_count": {
          "type": ["null", "number"]
        },
        "impression_count": {
          "type": ["null", "number"]
        },
        "bookmark_count": {
          "type": ["null", "number"]
        }
      }
    }
  }
}
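
To make it concrete how a record from the new stream maps onto this schema, below is a small, hedged sketch (not part of the commit) that validates a hand-written sample record against a trimmed, inlined subset of the schema; it assumes the third-party jsonschema package is installed.

from jsonschema import validate  # third-party package, assumed installed

# Trimmed subset of the tags stream schema above, inlined to keep the sketch self-contained.
TAGS_SCHEMA_SUBSET = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "id": {"type": ["null", "string"]},
        "text": {"type": ["null", "string"]},
        "matched_tag": {"type": ["null", "string"]},
        "public_metrics": {
            "type": ["null", "object"],
            "properties": {"like_count": {"type": ["null", "number"]}},
        },
    },
}

# A record shaped like the tweets the stream yields (matched_tag is added in parse_response).
record = {
    "id": "1234567890",
    "text": "Shoutout to @IFT",
    "matched_tag": "@IFT",
    "public_metrics": {"like_count": 3},
}

validate(instance=record, schema=TAGS_SCHEMA_SUBSET)  # raises ValidationError if the shapes diverge
print("sample record conforms to the schema subset")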

source-twitter-fetcher/source_twitter_fetcher/source.py
Lines changed: 10 additions & 1 deletion

@@ -7,6 +7,7 @@
 from .tweets_stream import Account, Tweet, TweetMetrics, TweetPromoted
 from .ads_stream import PromotedTweetActive, PromotedTweetBilling, PromotedTweetEngagement
 from .spaces_stream import Space
+from .tags_stream import TagsStream
 from .auth import TwitterOAuth

 DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
@@ -27,6 +28,13 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
             start_time=datetime.strptime(config['start_time'], DATE_FORMAT),
         )

+        tags = TagsStream(
+            authenticator=auth,
+            account_id=config["account_id"],
+            start_time=datetime.strptime(config['start_time'], DATE_FORMAT),
+            tags=config["tags"]
+        )
+
         tweet_metrics = TweetMetrics(
             authenticator=auth,
             account_id=config['account_id'],
@@ -71,5 +79,6 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
             promoted_tweet_active,
             promoted_tweet_billing,
             promoted_tweet_engagement,
-            space
+            space,
+            tags
         ]
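
Two config values feed the new wiring: start_time is parsed with the module's DATE_FORMAT before being handed to TagsStream, and the tags list is passed through unchanged. A hedged, standalone sketch (not part of the commit) of that parsing step:

from datetime import datetime

DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"  # same constant source.py defines

config = {"start_time": "2024-01-01T00:00:00Z", "tags": ["@IFT", "@status"]}

# streams() calls strptime with this exact format, so a bare date like "2024-01-01"
# would raise ValueError before any stream is built.
start_time = datetime.strptime(config["start_time"], DATE_FORMAT)

# TagsStream then receives the parsed datetime plus the raw list:
# TagsStream(..., start_time=start_time, tags=config["tags"])
print(start_time.isoformat(), config["tags"])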

source-twitter-fetcher/source_twitter_fetcher/spec.yaml
Lines changed: 8 additions & 0 deletions

@@ -7,6 +7,7 @@ connectionSpecification:
     - credentials
     - account_id
     - start_time
+    - tags
   properties:
     credentials:
       title: Twitter Dev account Credentials
@@ -44,3 +45,10 @@ connectionSpecification:
       type: string
       description: "Start date of fetching data"
       format: datetime
+    tags:
+      type: array
+      title: "Tags to Monitor"
+      description: "List of Twitter handles to monitor (e.g., ['@IFT', '@Airbyte'])"
+      items:
+        type: string
+      minItems: 1
source-twitter-fetcher/source_twitter_fetcher/tags_stream.py (path inferred from the `from .tags_stream import TagsStream` import above)
Lines changed: 68 additions & 0 deletions (new file)

from typing import Any, Iterable, Mapping, MutableMapping, Optional, List
import logging
import requests
import time
from datetime import datetime
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http import HttpStream

logger = logging.getLogger("airbyte")

class TagsStream(HttpStream):
    url_base = "https://api.x.com/2/"
    primary_key = "id"

    def __init__(self, start_time: str = None, account_id: str = None, tags: List[str] = None, **kwargs):
        super().__init__(**kwargs)
        self.start_time = start_time
        self.account_id = account_id
        self.tags = tags or []

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        for tag in self.tags:
            yield {"tag": tag}

    def path(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None
    ) -> str:
        return "tweets/search/recent"  # this endpoint fetches data from the last 7 days

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        if 'meta' in response.json() and 'next_token' in response.json()['meta'] and response.json()['meta']['result_count'] > 0:
            logger.debug('DBG-NT: %s', response.json()['meta']['next_token'])
            return {"next_token": response.json()['meta']['next_token']}

    def request_params(
        self,
        next_page_token: Optional[Mapping[str, Any]] = None,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        tag = stream_slice["tag"]
        params = {
            "query": tag,
            "tweet.fields": "text,public_metrics,author_id,referenced_tweets,created_at",
            "max_results": 100
        }
        params.update({"start_time": self.start_time.strftime("%Y-%m-%dT%H:%M:%SZ")})
        if next_page_token:
            params.update(**next_page_token)
        return params

    def parse_response(
        self,
        response: requests.Response,
        stream_slice: Mapping[str, Any] = None,
        **kwargs
    ) -> Iterable[Mapping]:
        logger.debug("Full response %s", response.json())
        if 'data' in response.json():
            data = response.json()['data']
            for t in data:
                # Add the tag that matched this tweet
                t["matched_tag"] = stream_slice["tag"]
                yield t
        time.sleep(2)  # Rate limiting protection
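
For a quick feel of how the stream behaves, the hedged sketch below (not part of the commit) instantiates TagsStream with a plain bearer-token authenticator from the Airbyte CDK — the connector itself wires it up with TwitterOAuth in source.py — and prints the search query built for each configured tag without making any HTTP calls.

from datetime import datetime

from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator

from source_twitter_fetcher.tags_stream import TagsStream  # assumes the sketch runs from the connector directory

stream = TagsStream(
    authenticator=TokenAuthenticator(token="<bearer-token>"),  # placeholder credential, illustration only
    account_id="123456789",
    start_time=datetime(2024, 1, 1),
    tags=["@IFT", "@status"],
)

# stream_slices() yields one slice per configured tag, so each tag becomes its own
# /2/tweets/search/recent query; request_params() shows that query without calling the API.
for stream_slice in stream.stream_slices():
    params = stream.request_params(stream_slice=stream_slice)
    print(stream_slice["tag"], "->", params["query"], "since", params["start_time"])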
