Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,13 @@ Can also generate histograms and word clouds from the chat logs.

### Support Matrix

| Platform | Direct Chat | Group Chat |
|:------------------:|:-----------: |:----------:|
| Platform | Direct Chat | Group Chat |
|:------------------:|:------------:|:-----------:|
| Facebook Messenger | ✔ | ✘ |
| Google Hangouts | ✔ | ✘ |
| Telegram | ✔ | ✘ |
| WhatsApp | ✔ | ✔ |
| Skype | ✔ | ✔ |

### Exported data

Expand Down Expand Up @@ -80,6 +81,18 @@ Unfortunately, WhatsApp only lets you export your conversations **from your phon

The Telegram API works differently: you will first need to setup Chatistics, then query your chat logs programmatically. This process is documented below. Exporting Telegram chat logs is very fast.

### Skype

**Warning:** Skype archives can take a long time to be ready for download - up to one hour in our experience.

1. Login to your Skype account: https://go.skype.com/export
2. Select the option to download your Conversations, and then select Submit request
3. When your request is complete, you'll receive a notification in Skype with a link to view or download your file.
If you don't receive a notification in Skype, check the [export page](https://go.skype.com/export).
A link to download your files will also appear there once they are available to download.
4. Click the Download button to download your files
5. Extract the file called `messages.json` to `./raw_data/skype/`

## 2. Setup Chatistics

First, install the required Python packages using conda:
Expand All @@ -102,6 +115,9 @@ python parse.py messenger

# WhatsApp
python parse.py whatsapp

# Skype
python parse.py skype
```

### Telegram
Expand Down Expand Up @@ -143,8 +159,8 @@ Plot all messages with:
Among other options you can filter messages as needed (also see `python visualize.py breakdown --help`):

```
--platforms {telegram,whatsapp,messenger,hangouts}
Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts'])
--platforms {telegram,whatsapp,messenger,hangouts,skype}
Use data only from certain platforms (default: ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype'])
--filter-conversation
Limit by conversations with this person/group (default: [])
--filter-sender
Expand Down
3 changes: 3 additions & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ telegram:
whatsapp:
DEFAULT_RAW_LOCATION: 'raw_data/whatsapp'
OUTPUT_PICKLE_NAME: 'whatsapp.pkl'
skype:
DEFAULT_RAW_LOCATION: 'raw_data/skype/messages.json'
OUTPUT_PICKLE_NAME: 'skype.pkl'
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@ dependencies:
- pip==19.3.1
- seaborn==0.9.0
- wordcloud==1.6.0
- beautifulsoup4==4.8.2
- pip:
- telethon==1.10.9
10 changes: 10 additions & 0 deletions parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
hangouts Parse logs from hangouts
messenger Parse logs from messenger
whatsapp Parse logs from whatsapp
skype Parse logs from skype
"""


Expand Down Expand Up @@ -79,6 +80,15 @@ def whatsapp(self):
args = parser.parse_args(sys.argv[2:])
main(args.own_name, args.file_path, args.max, args.infer_datetime)

def skype(self):
    """Sub-command: parse a Skype ``messages.json`` export into a pickle."""
    from parsers.skype import main
    arg_parser = ArgParseDefault(description='Parse message logs from Skype')
    arg_parser = add_common_parse_arguments(arg_parser)
    arg_parser.add_argument(
        '-f', '--file-path',
        dest='file_path',
        default=config['skype']['DEFAULT_RAW_LOCATION'],
        help='Path to Skype chat log file (json file)')
    parsed = arg_parser.parse_args(sys.argv[2:])
    main(parsed.own_name, parsed.file_path, parsed.max)


if __name__ == '__main__':
ArgParse()
143 changes: 143 additions & 0 deletions parsers/skype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
from parsers.config import config
from parsers.utils import export_dataframe, detect_language
import json
import pandas as pd
import logging
from bs4 import BeautifulSoup
from collections import defaultdict
from dateutil.parser import parse
import os
import html
import warnings

log = logging.getLogger(__name__)

warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

def main(own_name, file_path, max_exported_messages):
    """Parse a Skype export archive into a pickled DataFrame.

    :param own_name: Name used for messages sent by the archive owner;
        if None, it is derived from the archive's ``userId``.
    :param file_path: Path to the exported ``messages.json`` file.
    :param max_exported_messages: Stop parsing after this many messages.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    log.info('Parsing Skype data...')
    if not os.path.isfile(file_path):
        log.error(f'No input file under {file_path}')
        # Missing input is a failure: signal it with a non-zero exit code
        # (the original exit(0) reported success to the shell).
        exit(1)
    archive = read_archive(file_path)
    own_id = archive["userId"]
    if own_name is None:
        # Skype ids look like "live:username" or "8:username"; fall back to
        # the part after the last colon as a human-readable name.
        own_name = own_id.rsplit(":", 1)[-1]
    data = parse_messages(archive, own_id, own_name)
    log.info('{:,} messages parsed.'.format(len(data)))
    if not data:
        log.info('Nothing to save.')
        exit(0)
    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data, columns=config['ALL_COLUMNS'])
    df['platform'] = 'skype'
    log.info('Detecting languages...')
    df = detect_language(df)
    export_dataframe(df, config['skype']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')


def parse_messages(archive, own_id, own_name):
    """Extract chat messages from a parsed Skype archive.

    :param archive: Parsed ``messages.json`` content (dict with a
        ``conversations`` list).
    :param own_id: Skype user id of the archive owner.
    :param own_name: Display name used for the archive owner.
    :return: List of rows matching ``config['ALL_COLUMNS']``
        (timestamp, conversation id/name, sender, outgoing flag, text, '', '').
    """
    names = {}
    data = []

    def id_to_name(_id):
        # Return the previously seen display name for an id, or None.
        return names.get(_id)

    def save_name_for_id(name, _id):
        # First name seen for an id wins; later conflicts are only logged.
        if _id not in names:
            names[_id] = name
        elif names[_id] != name:
            log.info(f'Assuming {name} is {names[_id]}')

    def resolve_sender_name(sender_name, sender_id):
        # Resolve a missing sender name: previously seen names first, then
        # the suffix of the id after the last colon as a last resort.
        if sender_name:
            return sender_name
        resolved = id_to_name(sender_id)
        if resolved is None:
            log.error(f"No senderName could be found for senderId ({sender_id})")
            resolved = sender_id.rsplit(":", 1)[-1]
            save_name_for_id(resolved, sender_id)
        return resolved

    def strip_rich_text(content):
        # Remove HTML markup and quoted replies from a RichText body.
        soup = BeautifulSoup(content, "html.parser")
        for quoted in soup(["quote", "legacyquote"]):
            quoted.extract()
        return soup.get_text()

    log.info('Extracting names...')
    for conversation in archive["conversations"]:
        if conversation["threadProperties"]:
            # Group conversation: names come from the individual messages.
            for message in conversation["MessageList"]:
                sender_name = message["displayName"]
                if sender_name:
                    save_name_for_id(html.unescape(sender_name), message["from"])
        else:
            # 1:1 conversation: the conversation entry carries the peer name.
            peer_name = conversation["displayName"]
            if peer_name:
                save_name_for_id(html.unescape(peer_name), conversation["id"])

    save_name_for_id(own_name, own_id)

    log.info('Extracting messages...')
    for conversation in archive["conversations"]:
        conversation_with_id = conversation["id"]
        conversation_with_name = conversation["displayName"]
        if not conversation_with_name:
            # If conversation_with_name is None we are collecting caller log files -> skip
            continue
        conversation_with_name = html.unescape(conversation_with_name)
        for message in conversation["MessageList"]:
            message_type = message["messagetype"]
            # Only text-bearing message types are exported.
            if message_type not in ("RichText", "Text"):
                continue
            timestamp = parse(message["originalarrivaltime"]).timestamp()
            content = message["content"]
            sender_id = message["from"]
            sender_name = message["displayName"]
            if sender_name:
                sender_name = html.unescape(sender_name)
            sender_name = resolve_sender_name(sender_name, sender_id)
            outgoing = sender_id == own_id
            if message_type == "RichText":
                content = strip_rich_text(content)

            # saves the message
            data.append([timestamp, conversation_with_id, conversation_with_name,
                         sender_name, outgoing, content, '', ''])

            if len(data) >= MAX_EXPORTED_MESSAGES:
                log.warning(f'Reached max exported messages limit of {MAX_EXPORTED_MESSAGES}. Increase limit in order to parse all messages.')
                return data
    return data


def read_archive(file_path):
    """Load the Skype JSON export at *file_path* and return the parsed dict.

    :param file_path: Path to the exported ``messages.json`` file.
    :return: Deserialized archive content.
    """
    log.info(f'Reading archive file {file_path}...')
    with open(file_path, encoding='utf-8') as f:
        # json.load streams from the file object; avoids building an
        # intermediate string for potentially large archives.
        return json.load(f)

1 change: 0 additions & 1 deletion parsers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ def timestamp_to_ordinal(value):
return datetime.datetime.fromtimestamp(float(value)).toordinal()



def detect_language(df, min_token_count=5):
"""Detects language of input text"""
for name, group in df.groupby(df.conversationWithName):
Expand Down
4 changes: 4 additions & 0 deletions raw_data/skype/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Ignore everything in this directory
*
# Except this file
!.gitignore
2 changes: 1 addition & 1 deletion utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, **kwargs):

def add_load_data_args(parser):
"""Adds common data loader arguments to arg parser"""
platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts']
platforms = ['telegram', 'whatsapp', 'messenger', 'hangouts', 'skype']
parser.add_argument('-p', '--platforms', default=platforms, choices=platforms, nargs='+', help='Use data only from certain platforms')
parser.add_argument('--filter-conversation', dest='filter_conversation', nargs='+', default=[], help='Limit by conversations with this person/group')
parser.add_argument('--filter-sender', dest='filter_sender', nargs='+', default=[], help='Limit by messages by this sender')
Expand Down