diff --git a/README.md b/README.md index 863b5c4..23ecbcc 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,13 @@ Can also generate histograms and word clouds from the chat logs. ### Support Matrix -| Platform | Direct Chat | Group Chat | -|:------------------:|:-----------: |:----------:| +| Platform | Direct Chat | Group Chat | +|:------------------:|:------------:|:-----------:| | Facebook Messenger | ✔ | ✘ | | Google Hangouts | ✔ | ✘ | | Telegram | ✔ | ✘ | | WhatsApp | ✔ | ✔ | +| Skype | ✔ | ✔ | ### Exported data @@ -80,6 +81,18 @@ Unfortunately, WhatsApp only lets you export your conversations **from your phon The Telegram API works differently: you will first need to setup Chatistics, then query your chat logs programmatically. This process is documented below. Exporting Telegram chat logs is very fast. +### Skype + +**Warning:** Skype archives can take a long time to be ready for download - up to one hour in our experience. + +1. Log in to your Skype account: https://go.skype.com/export +2. Select the option to download your Conversations, and then select Submit request +3. When your request is complete, you'll receive a notification in Skype with a link to view or download your file. + If you don't receive a notification in Skype, check the [export page](https://go.skype.com/export). + A link to download your files will also appear there once they are available to download. +4. Click the Download button to download your files +5. Extract the file called `messages.json` to `./raw_data/skype/` + ## 2. 
Setup Chatistics First, install the required Python packages using conda: @@ -102,6 +115,9 @@ python parse.py messenger # WhatsApp python parse.py whatsapp + +# Skype +python parse.py skype ``` ### Telegram @@ -143,8 +159,8 @@ Plot all messages with: Among other options you can filter messages as needed (also see `python visualize.py breakdown --help`): ``` - --platforms {telegram,whatsapp,messenger,hangouts} - Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts']) + --platforms {telegram,whatsapp,messenger,hangouts,skype} + Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype']) --filter-conversation Limit by conversations with this person/group (default: []) --filter-sender diff --git a/config.yml b/config.yml index 2332bed..f77e000 100644 --- a/config.yml +++ b/config.yml @@ -21,3 +21,6 @@ telegram: whatsapp: DEFAULT_RAW_LOCATION: 'raw_data/whatsapp' OUTPUT_PICKLE_NAME: 'whatsapp.pkl' +skype: + DEFAULT_RAW_LOCATION: 'raw_data/skype/messages.json' + OUTPUT_PICKLE_NAME: 'skype.pkl' diff --git a/environment.yml b/environment.yml index bb87472..f35d779 100644 --- a/environment.yml +++ b/environment.yml @@ -11,5 +11,6 @@ dependencies: - pip==19.3.1 - seaborn==0.9.0 - wordcloud==1.6.0 + - beautifulsoup4==4.8.2 - pip: - telethon==1.10.9 diff --git a/parse.py b/parse.py index 45bc623..401baeb 100644 --- a/parse.py +++ b/parse.py @@ -12,6 +12,7 @@ hangouts Parse logs from hangouts messenger Parse logs from messenger whatsapp Parse logs from whatsapp + skype Parse logs from skype """ @@ -79,6 +80,15 @@ def whatsapp(self): args = parser.parse_args(sys.argv[2:]) main(args.own_name, args.file_path, args.max, args.infer_datetime) + def skype(self): + from parsers.skype import main + parser = ArgParseDefault(description='Parse message logs from Skype') + parser = add_common_parse_arguments(parser) + parser.add_argument('-f', '--file-path', dest='file_path', 
default=config['skype']['DEFAULT_RAW_LOCATION'], + help='Path to Skype chat log file (json file)') + args = parser.parse_args(sys.argv[2:]) + main(args.own_name, args.file_path, args.max) + if __name__ == '__main__': ArgParse() diff --git a/parsers/skype.py b/parsers/skype.py new file mode 100644 index 0000000..c886e9e --- /dev/null +++ b/parsers/skype.py @@ -0,0 +1,143 @@ +from parsers.config import config +from parsers.utils import export_dataframe, detect_language +import json +import pandas as pd +import logging +from bs4 import BeautifulSoup +from collections import defaultdict +from dateutil.parser import parse +import os +import html +import warnings + +log = logging.getLogger(__name__) + +warnings.filterwarnings("ignore", category=UserWarning, module='bs4') + +def main(own_name, file_path, max_exported_messages): + global MAX_EXPORTED_MESSAGES + MAX_EXPORTED_MESSAGES = max_exported_messages + log.info('Parsing Skype data...') + if not os.path.isfile(file_path): + log.error(f'No input file under {file_path}') + exit(0) + archive = read_archive(file_path) + own_id = archive["userId"] + if own_name is None: + ind = own_id.rfind(":") + own_name = own_id[ind+1:] + data = parse_messages(archive, own_id, own_name) + log.info('{:,} messages parsed.'.format(len(data))) + if len(data) < 1: + log.info('Nothing to save.') + exit(0) + log.info('Converting to DataFrame...') + df = pd.DataFrame(data, columns=config['ALL_COLUMNS']) + df['platform'] = 'skype' + log.info('Detecting languages...') + df = detect_language(df) + export_dataframe(df, config['skype']['OUTPUT_PICKLE_NAME']) + log.info('Done.') + + +def parse_messages(archive, own_id, own_name): + def id_to_name(_id): + if _id in names: + return names[_id] + else: + return None + + def save_name_for_id(name, _id): + if not _id in names: + names[_id] = name + elif names[_id] != name: + log.info(f'Assuming {name} is {names[_id]}') + + names = {} + data = [] + log.info('Extracting names...') + for conversation in 
archive["conversations"]: + if conversation["threadProperties"]: + for message in conversation["MessageList"]: + sender_id = message["from"] + sender_name = message["displayName"] + if sender_name: + sender_name = html.unescape(sender_name) + save_name_for_id(sender_name, sender_id) + else: + conversation_with_id = conversation["id"] + conversation_with_name = conversation["displayName"] + if conversation_with_name: + conversation_with_name = html.unescape(conversation_with_name) + save_name_for_id(conversation_with_name, conversation_with_id) + + save_name_for_id(own_name, own_id) + + log.info('Extracting messages...') + for conversation in archive["conversations"]: + conversation_with_id = conversation["id"] + conversation_with_name = conversation["displayName"] + if conversation_with_name: + conversation_with_name = html.unescape(conversation_with_name) + else: + # If conversation_with_name is None we are collecting caller log files -> skip + continue + for message in conversation["MessageList"]: + message_type = message["messagetype"] + timestamp = parse(message["originalarrivaltime"]).timestamp() + content = message["content"] + sender_id = message["from"] + sender_name = message["displayName"] + if sender_name: + sender_name = html.unescape(sender_name) + outgoing = sender_id == own_id + + if message_type == "RichText": + if not sender_name: + sender_name = id_to_name(sender_id) + if sender_name is None: + # unknown sender + log.error(f"No senderName could be found for senderId ({sender_id})") + ind = sender_id.rfind(":") + sender_name = sender_id[ind+1:] + save_name_for_id(sender_name, sender_id) + + soup = BeautifulSoup(content, "html.parser") + + # remove quotes + for script in soup(["quote", "legacyquote"]): + script.extract() + + content = soup.get_text() + + # saves the message + data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, content, '', '']] + + if len(data) >= MAX_EXPORTED_MESSAGES: + log.warning(f'Reached 
max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.') + return data + elif message_type == "Text": + if not sender_name: + sender_name = id_to_name(sender_id) + if sender_name is None: + # unknown sender + log.error(f"No senderName could be found for senderId ({sender_id})") + ind = sender_id.rfind(":") + sender_name = sender_id[ind+1:] + save_name_for_id(sender_name, sender_id) + + # saves the message + data += [[timestamp, conversation_with_id, conversation_with_name, sender_name, outgoing, content, '', '']] + + if len(data) >= MAX_EXPORTED_MESSAGES: + log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.') + return data + return data + + +def read_archive(file_path): + log.info(f'Reading archive file {file_path}...') + with open(file_path, encoding='utf-8') as f: + archive = json.loads(f.read()) + return archive + diff --git a/parsers/utils.py b/parsers/utils.py index 8e5cad3..2d1c550 100755 --- a/parsers/utils.py +++ b/parsers/utils.py @@ -18,7 +18,6 @@ def timestamp_to_ordinal(value): return datetime.datetime.fromtimestamp(float(value)).toordinal() - def detect_language(df, min_token_count=5): """Detects language of input text""" for name, group in df.groupby(df.conversationWithName): diff --git a/raw_data/skype/.gitignore b/raw_data/skype/.gitignore new file mode 100644 index 0000000..5e7d273 --- /dev/null +++ b/raw_data/skype/.gitignore @@ -0,0 +1,4 @@ +# Ignore everything in this directory +* +# Except this file +!.gitignore diff --git a/utils.py b/utils.py index 50659a6..a63a2a8 100644 --- a/utils.py +++ b/utils.py @@ -16,7 +16,7 @@ def __init__(self, **kwargs): def add_load_data_args(parser): """Adds common data loader arguments to arg parser""" - platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts'] + platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype'] parser.add_argument('-p', '--platforms', 
default=platforms, choices=platforms, nargs='+', help='Use data only from certain platforms') parser.add_argument('--filter-conversation', dest='filter_conversation', nargs='+', default=[], help='Limit by conversations with this person/group') parser.add_argument('--filter-sender', dest='filter_sender', nargs='+', default=[], help='Limit by messages by this sender')