From 51f54611f7f26ef7b60ba6bfc0a294c9e5895e9c Mon Sep 17 00:00:00 2001 From: Arkadiy Shapkin Date: Wed, 19 Feb 2020 20:00:43 +0300 Subject: [PATCH 1/3] Added Skype support --- README.md | 22 +++++++- config.yml | 3 + parse.py | 10 ++++ parsers/skype.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++ utils.py | 2 +- 5 files changed, 173 insertions(+), 4 deletions(-) create mode 100644 parsers/skype.py diff --git a/README.md b/README.md index 863b5c4..0062a0b 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,13 @@ Can also generate histograms and word clouds from the chat logs. ### Support Matrix -| Platform | Direct Chat | Group Chat | -|:------------------:|:-----------: |:----------:| +| Platform | Direct Chat | Group Chat | +|:------------------:|:------------:|:-----------:| | Facebook Messenger | ✔ | ✘ | | Google Hangouts | ✔ | ✘ | | Telegram | ✔ | ✘ | | WhatsApp | ✔ | ✔ | +| Skype | ✔ | ✔ | ### Exported data @@ -80,6 +81,18 @@ Unfortunately, WhatsApp only lets you export your conversations **from your phon The Telegram API works differently: you will first need to setup Chatistics, then query your chat logs programmatically. This process is documented below. Exporting Telegram chat logs is very fast. +### Skype + +**Warning:** Skype archives can take a long time to be ready for download - up to one hour in our experience. + +1. Log in to your Skype account: https://go.skype.com/export +2. Select the option to download your Conversations, and then select Submit request +3. When your request is complete, you'll receive a notification in Skype with a link to view or download your file. + If you don't receive a notification in Skype, check the [export page](https://go.skype.com/export). + A link to download your files will also appear there once they are available to download. +4. Click the Download button to download your files +5. Extract the file called `messages.json` to `./raw_data/skype/` + ## 2. 
Setup Chatistics First, install the required Python packages using conda: @@ -102,6 +115,9 @@ python parse.py messenger # WhatsApp python parse.py whatsapp + +# Skype +python parse.py skype ``` ### Telegram @@ -144,7 +160,7 @@ Among other options you can filter messages as needed (also see `python visualiz ``` --platforms {telegram,whatsapp,messenger,hangouts} - Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts']) + Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype']) --filter-conversation Limit by conversations with this person/group (default: []) --filter-sender diff --git a/config.yml b/config.yml index 2332bed..f77e000 100644 --- a/config.yml +++ b/config.yml @@ -21,3 +21,6 @@ telegram: whatsapp: DEFAULT_RAW_LOCATION: 'raw_data/whatsapp' OUTPUT_PICKLE_NAME: 'whatsapp.pkl' +skype: + DEFAULT_RAW_LOCATION: 'raw_data/skype/messages.json' + OUTPUT_PICKLE_NAME: 'skype.pkl' diff --git a/parse.py b/parse.py index 45bc623..401baeb 100644 --- a/parse.py +++ b/parse.py @@ -12,6 +12,7 @@ hangouts Parse logs from hangouts messenger Parse logs from messenger whatsapp Parse logs from whatsapp + skype Parse logs from skype """ @@ -79,6 +80,15 @@ def whatsapp(self): args = parser.parse_args(sys.argv[2:]) main(args.own_name, args.file_path, args.max, args.infer_datetime) + def skype(self): + from parsers.skype import main + parser = ArgParseDefault(description='Parse message logs from Skype') + parser = add_common_parse_arguments(parser) + parser.add_argument('-f', '--file-path', dest='file_path', default=config['skype']['DEFAULT_RAW_LOCATION'], + help='Path to Skype chat log file (json file)') + args = parser.parse_args(sys.argv[2:]) + main(args.own_name, args.file_path, args.max) + if __name__ == '__main__': ArgParse() diff --git a/parsers/skype.py b/parsers/skype.py new file mode 100644 index 0000000..2753052 --- /dev/null +++ b/parsers/skype.py @@ -0,0 +1,140 @@ +from 
parsers.config import config +from parsers.utils import export_dataframe, detect_language +import json +import pandas as pd +import logging +from bs4 import BeautifulSoup +from collections import defaultdict +from dateutil.parser import parse +import os +import html +import warnings + +log = logging.getLogger(__name__) + +warnings.filterwarnings("ignore", category=UserWarning, module='bs4') + +def main(own_name, file_path, max_exported_messages): + global MAX_EXPORTED_MESSAGES + MAX_EXPORTED_MESSAGES = max_exported_messages + log.info('Parsing Skype data...') + if not os.path.isfile(file_path): + log.error(f'No input file under {file_path}') + exit(0) + archive = read_archive(file_path) + own_id = archive["userId"] + if own_name is None: + ind = own_id.rfind(":") + own_name = own_id[ind+1:] + data = parse_messages(archive, own_id, own_name) + log.info('{:,} messages parsed.'.format(len(data))) + if len(data) < 1: + log.info('Nothing to save.') + exit(0) + log.info('Converting to DataFrame...') + df = pd.DataFrame(data, columns=config['ALL_COLUMNS']) + df['platform'] = 'skype' + log.info('Detecting languages...') + df = detect_language(df) + export_dataframe(df, config['skype']['OUTPUT_PICKLE_NAME']) + log.info('Done.') + + +def parse_messages(archive, own_id, own_name): + def id_to_name(_id): + if _id in names: + return names[_id] + else: + return None + + def save_name_for_id(name, _id): + if not _id in names: + names[_id] = name + elif names[_id] != name: + log.info(f'Assuming {name} is {names[_id]}') + + names = {} + data = [] + log.info('Extracting names...') + for conversation in archive["conversations"]: + if conversation["threadProperties"]: + for message in conversation["MessageList"]: + sender_id = message["from"] + sender_name = message["displayName"] + if sender_name: + sender_name = html.unescape(sender_name) + save_name_for_id(sender_name, sender_id) + else: + conversation_with_id = conversation["id"] + conversation_with_name = 
conversation["displayName"] + if conversation_with_name: + conversation_with_name = html.unescape(conversation_with_name) + save_name_for_id(conversation_with_name, conversation_with_id) + + save_name_for_id(own_name, own_id) + + log.info('Extracting messages...') + for conversation in archive["conversations"]: + conversation_with_id = conversation["id"] + conversation_with_name = conversation["displayName"] + if conversation_with_name: + conversation_with_name = html.unescape(conversation_with_name) + for message in conversation["MessageList"]: + message_type = message["messagetype"] + timestamp = parse(message["originalarrivaltime"]).timestamp() + content = message["content"] + sender_id = message["from"] + sender_name = message["displayName"] + if sender_name: + sender_name = html.unescape(sender_name) + outgoing = sender_id == own_id + + if message_type == "RichText": + if not sender_name: + sender_name = id_to_name(sender_id) + if sender_name is None: + # unknown sender + log.error(f"No senderName could be found for senderId ({sender_id})") + ind = sender_id.rfind(":") + sender_name = sender_id[ind+1:] + save_name_for_id(sender_name, sender_id) + + soup = BeautifulSoup(content, "html.parser") + + # remove quotes + for script in soup(["quote", "legacyquote"]): + script.extract() + + content = soup.get_text() + + # saves the message + data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, content, '', '']] + + if len(data) >= MAX_EXPORTED_MESSAGES: + log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. 
Increase limit in order to parse all messages.') + return data + elif message_type == "Text": + if not sender_name: + sender_name = id_to_name(sender_id) + if sender_name is None: + # unknown sender + log.error(f"No senderName could be found for senderId ({sender_id})") + ind = sender_id.rfind(":") + sender_name = sender_id[ind+1:] + save_name_for_id(sender_name, sender_id) + + # saves the message + data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, content, '', '']] + + if len(data) >= MAX_EXPORTED_MESSAGES: + log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.') + return data + return data + + +def read_archive(file_path): + log.info(f'Reading archive file {file_path}...') + with open(file_path, encoding='utf-8') as f: + archive = json.loads(f.read()) + return archive + diff --git a/utils.py b/utils.py index 50659a6..a63a2a8 100644 --- a/utils.py +++ b/utils.py @@ -16,7 +16,7 @@ def __init__(self, **kwargs): def add_load_data_args(parser): """Adds common data loader arguments to arg parser""" - platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts'] + platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype'] parser.add_argument('-p', '--platforms', default=platforms, choices=platforms, nargs='+', help='Use data only from certain platforms') parser.add_argument('--filter-conversation', dest='filter_conversation', nargs='+', default=[], help='Limit by conversations with this person/group') parser.add_argument('--filter-sender', dest='filter_sender', nargs='+', default=[], help='Limit by messages by this sender') From b2881466c55ac2910be170270f3920e6b6fd54c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20M=C3=BCller?= Date: Wed, 19 Feb 2020 23:42:57 +0100 Subject: [PATCH 2/3] add beautifulsoup4 to environment, add gitignore to raw_data/skype, minor improvements to parsing --- environment.yml | 1 + parsers/skype.py | 11 +++++++---- 
parsers/utils.py | 6 +++++- raw_data/skype/.gitignore | 4 ++++ 4 files changed, 17 insertions(+), 5 deletions(-) create mode 100644 raw_data/skype/.gitignore diff --git a/environment.yml b/environment.yml index bb87472..f35d779 100644 --- a/environment.yml +++ b/environment.yml @@ -11,5 +11,6 @@ dependencies: - pip==19.3.1 - seaborn==0.9.0 - wordcloud==1.6.0 + - beautifulsoup4==4.8.2 - pip: - telethon==1.10.9 diff --git a/parsers/skype.py b/parsers/skype.py index 2753052..c886e9e 100644 --- a/parsers/skype.py +++ b/parsers/skype.py @@ -79,6 +79,9 @@ def save_name_for_id(name, _id): conversation_with_name = conversation["displayName"] if conversation_with_name: conversation_with_name = html.unescape(conversation_with_name) + else: + # If conversation_with_name is None we are collecting caller log files -> skip + continue for message in conversation["MessageList"]: message_type = message["messagetype"] timestamp = parse(message["originalarrivaltime"]).timestamp() @@ -98,18 +101,18 @@ def save_name_for_id(name, _id): ind = sender_id.rfind(":") sender_name = sender_id[ind+1:] save_name_for_id(sender_name, sender_id) - + soup = BeautifulSoup(content, "html.parser") # remove quotes for script in soup(["quote", "legacyquote"]): - script.extract() + script.extract() content = soup.get_text() # saves the message data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, content, '', '']] - + if len(data) >= MAX_EXPORTED_MESSAGES: log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. 
Increase limit in order to parse all messages.') return data @@ -122,7 +125,7 @@ def save_name_for_id(name, _id): ind = sender_id.rfind(":") sender_name = sender_id[ind+1:] save_name_for_id(sender_name, sender_id) - + # saves the message data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, content, '', '']] diff --git a/parsers/utils.py b/parsers/utils.py index 15d1309..ace0cf0 100755 --- a/parsers/utils.py +++ b/parsers/utils.py @@ -2,6 +2,7 @@ import datetime import logging import langdetect +from langdetect.lang_detect_exception import LangDetectException log = logging.getLogger(__name__) @@ -21,7 +22,10 @@ def detect_language(df, min_token_count=5): for name, group in df.groupby(df.conversationWithName): text = ' '.join(group['text'].dropna().values[:100]) if len(text.split()) >= min_token_count: - lang = langdetect.detect(text) + try: + lang = langdetect.detect(text) + except LangDetectException: + lang = 'unknown' else: lang = 'unknown' df.loc[group.index, 'language'] = lang diff --git a/raw_data/skype/.gitignore b/raw_data/skype/.gitignore new file mode 100644 index 0000000..5e7d273 --- /dev/null +++ b/raw_data/skype/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore From 24f8c77ba511da4f472144b94be5dea32af90530 Mon Sep 17 00:00:00 2001 From: Arkady Shapkin Date: Sat, 25 Apr 2020 23:02:16 +0300 Subject: [PATCH 3/3] Apply suggestions from code review Co-Authored-By: Willem Handreck --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0062a0b..23ecbcc 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ Plot all messages with: Among other options you can filter messages as needed (also see `python visualize.py breakdown --help`): ``` - --platforms {telegram,whatsapp,messenger,hangouts} + --platforms {telegram,whatsapp,messenger,hangouts,skype} Use data only from certain platforms (default: ['telegram', 'whatsapp', 
'messenger', 'hangouts', 'skype']) --filter-conversation Limit by conversations with this person/group (default: [])