Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ Can also generate histograms and word clouds from the chat logs.

### Support Matrix

| Platform | Direct Chat | Group Chat |
|:------------------:|:-----------: |:----------:|
| Platform | Direct Chat | Group Chat |
|:------------------:|:------------:|:-----------:|
| Facebook Messenger | ✔ | ✘ |
| Google Hangouts | ✔ | ✘ |
| Telegram | ✔ | ✘ |
| WhatsApp | ✔ | ✔ |
| Skype | ✔ | ✔ |

### Exported data

Expand Down Expand Up @@ -80,6 +81,18 @@ Unfortunately, WhatsApp only lets you export your conversations **from your phon

The Telegram API works differently: you will first need to setup Chatistics, then query your chat logs programmatically. This process is documented below. Exporting Telegram chat logs is very fast.

### Skype

**Warning:** Skype archives can take a long time to be ready for download - up to one hour in our experience.

1. Login to your Skype account: https://go.skype.com/export
2. Select the option to download your Conversations, and then select Submit request
3. When your request is complete, you'll receive a notification in Skype with a link to view or download your file.
If you don't receive a notification in Skype, check the [export page](https://go.skype.com/export).
A link to download your files will also appear there once they are available to download.
4. Click the Download button to download your files
5. Extract the file called `messages.json` to `./raw_data/skype/`

## 2. Setup Chatistics

First, install the required Python packages using conda:
Expand All @@ -102,6 +115,9 @@ python parse.py messenger

# WhatsApp
python parse.py whatsapp

# Skype
python parse.py skype
```

### Telegram
Expand Down Expand Up @@ -143,8 +159,8 @@ Plot all messages with:
Among other options you can filter messages as needed (also see `python visualize.py breakdown --help`):

```
--platforms {telegram,whatsapp,messenger,hangouts}
Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts'])
--platforms {telegram,whatsapp,messenger,hangouts,skype}
Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype'])
--filter-conversation
Limit by conversations with this person/group (default: [])
--filter-sender
Expand Down
3 changes: 3 additions & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ telegram:
whatsapp:
DEFAULT_RAW_LOCATION: 'raw_data/whatsapp'
OUTPUT_PICKLE_NAME: 'whatsapp.pkl'
skype:
DEFAULT_RAW_LOCATION: 'raw_data/skype/messages.json'
OUTPUT_PICKLE_NAME: 'skype.pkl'
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ dependencies:
- pip==19.3.1
- seaborn==0.9.0
- wordcloud==1.6.0
- beautifulsoup4==4.8.2
- pip:
- telethon==1.10.9
10 changes: 10 additions & 0 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
hangouts Parse logs from hangouts
messenger Parse logs from messenger
whatsapp Parse logs from whatsapp
skype Parse logs from skype
"""


Expand Down Expand Up @@ -79,6 +80,15 @@ def whatsapp(self):
args = parser.parse_args(sys.argv[2:])
main(args.own_name, args.file_path, args.max, args.infer_datetime)

def skype(self):
    """Sub-command: parse a Skype ``messages.json`` export into a pickle."""
    from parsers.skype import main
    arg_parser = ArgParseDefault(description='Parse message logs from Skype')
    arg_parser = add_common_parse_arguments(arg_parser)
    arg_parser.add_argument(
        '-f', '--file-path',
        dest='file_path',
        default=config['skype']['DEFAULT_RAW_LOCATION'],
        help='Path to Skype chat log file (json file)')
    parsed = arg_parser.parse_args(sys.argv[2:])
    main(parsed.own_name, parsed.file_path, parsed.max)


if __name__ == '__main__':
ArgParse()
143 changes: 143 additions & 0 deletions parsers/skype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from parsers.config import config
from parsers.utils import export_dataframe, detect_language
import json
import pandas as pd
import logging
from bs4 import BeautifulSoup
from collections import defaultdict
from dateutil.parser import parse
import os
import html
import warnings

log = logging.getLogger(__name__)

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

def main(own_name, file_path, max_exported_messages):
    """Parse a Skype export archive into a pickled DataFrame.

    :param own_name: Name used for messages sent by the archive owner;
        if None, it is derived from the archive's ``userId``.
    :param file_path: Path to the exported ``messages.json`` file.
    :param max_exported_messages: Stop parsing after this many messages.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    log.info('Parsing Skype data...')
    if not os.path.isfile(file_path):
        log.error(f'No input file under {file_path}')
        # Missing input is a failure: signal it with a non-zero exit code
        # (the original exit(0) reported success to the shell).
        exit(1)
    archive = read_archive(file_path)
    own_id = archive["userId"]
    if own_name is None:
        # Skype ids look like "live:username" or "8:username"; fall back to
        # the part after the last colon as a human-readable name.
        own_name = own_id.rsplit(":", 1)[-1]
    data = parse_messages(archive, own_id, own_name)
    log.info('{:,} messages parsed.'.format(len(data)))
    if not data:
        log.info('Nothing to save.')
        exit(0)
    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data, columns=config['ALL_COLUMNS'])
    df['platform'] = 'skype'
    log.info('Detecting languages...')
    df = detect_language(df)
    export_dataframe(df, config['skype']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')


def parse_messages(archive, own_id, own_name):
    """Extract chat messages from a parsed Skype archive.

    :param archive: Parsed ``messages.json`` content (dict with a
        ``conversations`` list).
    :param own_id: Skype user id of the archive owner.
    :param own_name: Display name used for the archive owner.
    :return: List of rows matching ``config['ALL_COLUMNS']``
        (timestamp, conversation id/name, sender, outgoing flag, text, '', '').
    """
    names = {}
    data = []

    def id_to_name(_id):
        # Return the previously seen display name for an id, or None.
        return names.get(_id)

    def save_name_for_id(name, _id):
        # First name seen for an id wins; later conflicts are only logged.
        if _id not in names:
            names[_id] = name
        elif names[_id] != name:
            log.info(f'Assuming {name} is {names[_id]}')

    def resolve_sender_name(sender_name, sender_id):
        # Resolve a missing sender name: previously seen names first, then
        # the suffix of the id after the last colon as a last resort.
        if sender_name:
            return sender_name
        resolved = id_to_name(sender_id)
        if resolved is None:
            log.error(f"No senderName could be found for senderId ({sender_id})")
            resolved = sender_id.rsplit(":", 1)[-1]
            save_name_for_id(resolved, sender_id)
        return resolved

    def strip_rich_text(content):
        # Remove HTML markup and quoted replies from a RichText body.
        soup = BeautifulSoup(content, "html.parser")
        for quoted in soup(["quote", "legacyquote"]):
            quoted.extract()
        return soup.get_text()

    log.info('Extracting names...')
    for conversation in archive["conversations"]:
        if conversation["threadProperties"]:
            # Group conversation: names come from the individual messages.
            for message in conversation["MessageList"]:
                sender_name = message["displayName"]
                if sender_name:
                    save_name_for_id(html.unescape(sender_name), message["from"])
        else:
            # 1:1 conversation: the conversation entry carries the peer name.
            peer_name = conversation["displayName"]
            if peer_name:
                save_name_for_id(html.unescape(peer_name), conversation["id"])

    save_name_for_id(own_name, own_id)

    log.info('Extracting messages...')
    for conversation in archive["conversations"]:
        conversation_with_id = conversation["id"]
        conversation_with_name = conversation["displayName"]
        if not conversation_with_name:
            # If conversation_with_name is None we are collecting caller log files -> skip
            continue
        conversation_with_name = html.unescape(conversation_with_name)
        for message in conversation["MessageList"]:
            message_type = message["messagetype"]
            # Only text-bearing message types are exported.
            if message_type not in ("RichText", "Text"):
                continue
            timestamp = parse(message["originalarrivaltime"]).timestamp()
            content = message["content"]
            sender_id = message["from"]
            sender_name = message["displayName"]
            if sender_name:
                sender_name = html.unescape(sender_name)
            sender_name = resolve_sender_name(sender_name, sender_id)
            outgoing = sender_id == own_id
            if message_type == "RichText":
                content = strip_rich_text(content)

            # saves the message
            data.append([timestamp, conversation_with_id, conversation_with_name,
                         sender_name, outgoing, content, '', ''])

            if len(data) >= MAX_EXPORTED_MESSAGES:
                log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.')
                return data
    return data


def read_archive(file_path):
    """Load the Skype JSON export at *file_path* and return the parsed dict.

    :param file_path: Path to the exported ``messages.json`` file.
    :return: Deserialized archive content.
    """
    log.info(f'Reading archive file {file_path}...')
    with open(file_path, encoding='utf-8') as f:
        # json.load streams from the file object; avoids building an
        # intermediate string for potentially large archives.
        return json.load(f)

1 change: 0 additions & 1 deletion parsers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def timestamp_to_ordinal(value):
return datetime.datetime.fromtimestamp(float(value)).toordinal()



def detect_language(df, min_token_count=5):
"""Detects language of input text"""
for name, group in df.groupby(df.conversationWithName):
Expand Down
4 changes: 4 additions & 0 deletions raw_data/skype/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
2 changes: 1 addition & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, **kwargs):

def add_load_data_args(parser):
"""Adds common data loader arguments to arg parser"""
platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts']
platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype']
parser.add_argument('-p', '--platforms', default=platforms, choices=platforms, nargs='+', help='Use data only from certain platforms')
parser.add_argument('--filter-conversation', dest='filter_conversation', nargs='+', default=[], help='Limit by conversations with this person/group')
parser.add_argument('--filter-sender', dest='filter_sender', nargs='+', default=[], help='Limit by messages by this sender')
Expand Down