
Commit 068cfd8 (1 parent: 6473ce1)

twitter: new stream to extract tweets tagging us
8 files changed: +173 −11 lines

source-twitter-fetcher/Dockerfile
Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-FROM airbyte/python-connector-base:1.1.0@sha256:dd17e347fbda94f7c3abff539be298a65af2d7fc27a307d89297df1081a45c27
-
+#FROM airbyte/python-connector-base:1.1.0@sha256:dd17e347fbda94f7c3abff539be298a65af2d7fc27a307d89297df1081a45c27
+FROM --platform=linux/amd64 airbyte/python-connector-base:1.1.0
 COPY . ./airbyte/integration_code
 RUN pip install ./airbyte/integration_code

source-twitter-fetcher/metadata.yaml
Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@ data:
   connectorType: source
   definitionId: 1c448bfb-8950-478c-9ae0-f03aaaf4e920
   dockerImageTag: '1.0.0'
-  dockerRepository: harbor.status.im/bi/airbyte/source-twitter-fetcher
+  dockerRepository: harbor.status.im/bi/airbyte/source-twitter-fetcher
   githubIssueLabel: source-twitter-fetcher
   icon: twitter-fetcher.svg
   license: MIT
Sample connector config (file path not shown)
Lines changed: 8 additions & 7 deletions

@@ -1,11 +1,12 @@
 {
   "credentials": {
-    "client_id": "some-id",
-    "client_secret": "some-secret",
-    "access_token": "some-access-token",
-    "refresh_token": "some-refresh-token",
-    "token_expiry_date": ""
+    "client_id": "your_client_id",
+    "client_secret": "your_client_secret",
+    "access_token": "your_access_token",
+    "refresh_token": "your_refresh_token",
+    "token_expiry_date": "2024-12-31T00:00:00Z"
   },
-  "account_id": "123456789",
-  "start_time": "2024-01-01T00:00:00Z"
+  "account_id": "your_account_id",
+  "start_time": "2024-01-01T00:00:00Z",
+  "tags": ["@IFT", "@status", "@Airbyte"]
 }
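
The new "tags" field is what the rest of this commit keys off: the connector spec (further down) marks it as required with at least one entry. As a quick illustration, here is a minimal, hedged Python sketch (not part of the commit) that sanity-checks a config shaped like the sample above before running the connector; the file path is hypothetical.

import json

CONFIG_PATH = "secrets/config.json"  # hypothetical path to a config shaped like the sample above

with open(CONFIG_PATH) as f:
    config = json.load(f)

# Mirror the connector spec: credentials, account_id, start_time and tags are all required,
# and tags must contain at least one handle (minItems: 1 in spec.yaml).
for key in ("credentials", "account_id", "start_time", "tags"):
    assert key in config, f"missing required config key: {key}"
assert isinstance(config["tags"], list) and len(config["tags"]) >= 1, "tags must be a non-empty list"
print("config contains", len(config["tags"]), "tag(s) to monitor")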

source-twitter-fetcher/sample_files/configured_catalog.json
Lines changed: 11 additions & 0 deletions

@@ -111,6 +111,17 @@
       },
       "sync_mode": "incremental",
       "destination_sync_mode": "overwrite"
+    },
+    {
+      "stream": {
+        "name": "tags",
+        "json_schema": {},
+        "supported_sync_modes": ["full_refresh"],
+        "source_defined_cursor": false,
+        "default_cursor_field": []
+      },
+      "sync_mode": "full_refresh",
+      "destination_sync_mode": "overwrite"
     }
   ]
 }
New JSON schema for the tags stream (file path not shown)
Lines changed: 65 additions & 0 deletions (new file)

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "type": "object",
  "properties": {
    "id": {
      "type": ["null", "string"]
    },
    "text": {
      "type": ["null", "string"]
    },
    "created_at": {
      "type": ["null", "string"]
    },
    "author_id": {
      "type": ["null", "string"]
    },
    "conversation_id": {
      "type": ["null", "string"]
    },
    "reply_settings": {
      "type": ["null", "string"]
    },
    "matched_tag": {
      "type": ["null", "string"],
      "description": "The tag that matched this tweet (e.g., '@IFT' or '@status')"
    },
    "referenced_tweets": {
      "type": ["null", "array"],
      "items": {
        "type": ["object"],
        "properties": {
          "type": {
            "type": ["null", "string"]
          },
          "id": {
            "type": ["null", "string"]
          }
        }
      }
    },
    "public_metrics": {
      "type": ["null", "object"],
      "properties": {
        "retweet_count": {
          "type": ["null", "number"]
        },
        "reply_count": {
          "type": ["null", "number"]
        },
        "like_count": {
          "type": ["null", "number"]
        },
        "quote_count": {
          "type": ["null", "number"]
        },
        "impression_count": {
          "type": ["null", "number"]
        },
        "bookmark_count": {
          "type": ["null", "number"]
        }
      }
    }
  }
}
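
To make it concrete how a record from the new stream maps onto this schema, below is a small, hedged sketch (not part of the commit) that validates a hand-written sample record against a trimmed, inlined subset of the schema; it assumes the third-party jsonschema package is installed.

from jsonschema import validate  # third-party package, assumed installed

# Trimmed subset of the tags stream schema above, inlined to keep the sketch self-contained.
TAGS_SCHEMA_SUBSET = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "type": "object",
    "properties": {
        "id": {"type": ["null", "string"]},
        "text": {"type": ["null", "string"]},
        "matched_tag": {"type": ["null", "string"]},
        "public_metrics": {
            "type": ["null", "object"],
            "properties": {"like_count": {"type": ["null", "number"]}},
        },
    },
}

# A record shaped like the tweets the stream yields (matched_tag is added in parse_response).
record = {
    "id": "1234567890",
    "text": "Shoutout to @IFT",
    "matched_tag": "@IFT",
    "public_metrics": {"like_count": 3},
}

validate(instance=record, schema=TAGS_SCHEMA_SUBSET)  # raises ValidationError if the shapes diverge
print("sample record conforms to the schema subset")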

source-twitter-fetcher/source_twitter_fetcher/source.py
Lines changed: 10 additions & 1 deletion

@@ -7,6 +7,7 @@
 from .tweets_stream import Account, Tweet, TweetMetrics, TweetPromoted
 from .ads_stream import PromotedTweetActive, PromotedTweetBilling, PromotedTweetEngagement
 from .spaces_stream import Space
+from .tags_stream import TagsStream
 from .auth import TwitterOAuth

 DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
@@ -27,6 +28,13 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
             start_time=datetime.strptime(config['start_time'], DATE_FORMAT),
         )

+        tags = TagsStream(
+            authenticator=auth,
+            account_id=config["account_id"],
+            start_time=datetime.strptime(config['start_time'], DATE_FORMAT),
+            tags=config["tags"]
+        )
+
         tweet_metrics = TweetMetrics(
             authenticator=auth,
             account_id=config['account_id'],
@@ -71,5 +79,6 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
             promoted_tweet_active,
             promoted_tweet_billing,
             promoted_tweet_engagement,
-            space
+            space,
+            tags
         ]
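
Two config values feed the new wiring: start_time is parsed with the module's DATE_FORMAT before being handed to TagsStream, and the tags list is passed through unchanged. A hedged, standalone sketch (not part of the commit) of that parsing step:

from datetime import datetime

DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"  # same constant source.py defines

config = {"start_time": "2024-01-01T00:00:00Z", "tags": ["@IFT", "@status"]}

# streams() calls strptime with this exact format, so a bare date like "2024-01-01"
# would raise ValueError before any stream is built.
start_time = datetime.strptime(config["start_time"], DATE_FORMAT)

# TagsStream then receives the parsed datetime plus the raw list:
# TagsStream(..., start_time=start_time, tags=config["tags"])
print(start_time.isoformat(), config["tags"])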

source-twitter-fetcher/source_twitter_fetcher/spec.yaml
Lines changed: 8 additions & 0 deletions

@@ -7,6 +7,7 @@ connectionSpecification:
     - credentials
     - account_id
     - start_time
+    - tags
   properties:
     credentials:
       title: Twitter Dev account Credentials
@@ -44,3 +45,10 @@ connectionSpecification:
       type: string
       description: "Start date of fetching data"
       format: datetime
+    tags:
+      type: array
+      title: "Tags to Monitor"
+      description: "List of Twitter handles to monitor (e.g., ['@IFT', '@Airbyte'])"
+      items:
+        type: string
+      minItems: 1
source-twitter-fetcher/source_twitter_fetcher/tags_stream.py (path inferred from the `from .tags_stream import TagsStream` import above)
Lines changed: 68 additions & 0 deletions (new file)

from typing import Any, Iterable, Mapping, MutableMapping, Optional, List
import logging
import requests
import time
from datetime import datetime
from airbyte_cdk.sources.streams import Stream
from airbyte_cdk.sources.streams.http import HttpStream

logger = logging.getLogger("airbyte")

class TagsStream(HttpStream):
    url_base = "https://api.x.com/2/"
    primary_key = "id"

    def __init__(self, start_time: str = None, account_id: str = None, tags: List[str] = None, **kwargs):
        super().__init__(**kwargs)
        self.start_time = start_time
        self.account_id = account_id
        self.tags = tags or []

    def stream_slices(self, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
        for tag in self.tags:
            yield {"tag": tag}

    def path(
        self,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None,
        next_page_token: Mapping[str, Any] = None
    ) -> str:
        return "tweets/search/recent"  # this endpoint fetches data from the last 7 days

    def next_page_token(self, response: requests.Response) -> Optional[Mapping[str, Any]]:
        if 'meta' in response.json() and 'next_token' in response.json()['meta'] and response.json()['meta']['result_count'] > 0:
            logger.debug('DBG-NT: %s', response.json()['meta']['next_token'])
            return {"next_token": response.json()['meta']['next_token']}

    def request_params(
        self,
        next_page_token: Optional[Mapping[str, Any]] = None,
        stream_state: Mapping[str, Any] = None,
        stream_slice: Mapping[str, Any] = None
    ) -> MutableMapping[str, Any]:
        tag = stream_slice["tag"]
        params = {
            "query": tag,
            "tweet.fields": "text,public_metrics,author_id,referenced_tweets,created_at",
            "max_results": 100
        }
        params.update({"start_time": self.start_time.strftime("%Y-%m-%dT%H:%M:%SZ")})
        if next_page_token:
            params.update(**next_page_token)
        return params

    def parse_response(
        self,
        response: requests.Response,
        stream_slice: Mapping[str, Any] = None,
        **kwargs
    ) -> Iterable[Mapping]:
        logger.debug("Full response %s", response.json())
        if 'data' in response.json():
            data = response.json()['data']
            for t in data:
                # Add the tag that matched this tweet
                t["matched_tag"] = stream_slice["tag"]
                yield t
        time.sleep(2)  # Rate limiting protection
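
For a quick feel of how the stream behaves, the hedged sketch below (not part of the commit) instantiates TagsStream with a plain bearer-token authenticator from the Airbyte CDK — the connector itself wires it up with TwitterOAuth in source.py — and prints the search query built for each configured tag without making any HTTP calls.

from datetime import datetime

from airbyte_cdk.sources.streams.http.requests_native_auth import TokenAuthenticator

from source_twitter_fetcher.tags_stream import TagsStream  # assumes the sketch runs from the connector directory

stream = TagsStream(
    authenticator=TokenAuthenticator(token="<bearer-token>"),  # placeholder credential, illustration only
    account_id="123456789",
    start_time=datetime(2024, 1, 1),
    tags=["@IFT", "@status"],
)

# stream_slices() yields one slice per configured tag, so each tag becomes its own
# /2/tweets/search/recent query; request_params() shows that query without calling the API.
for stream_slice in stream.stream_slices():
    params = stream.request_params(stream_slice=stream_slice)
    print(stream_slice["tag"], "->", params["query"], "since", params["start_time"])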
