bookmarkarchiver.py
"""Python script that archives all of your bookmarks on the Internet Archive. Supports all major browsers."""
import argparse
import json
import re
import secrets
import time

import browser_cookie3
import requests

JSON_HEADER = {"Accept": "application/json"}
KNOWN_ERRORS = (KeyError, requests.exceptions.ConnectionError, json.JSONDecodeError)
START_TIME = time.time()
cookies = None  # populated in main() from a local browser via browser_cookie3


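# The script talks to the Wayback Machine's "Save Page Now" endpoints
# (list reconstructed from the calls below):
#   POST https://web.archive.org/save                   queue a capture
#   GET  https://web.archive.org/save/status/user       check free request slots
#   GET  https://web.archive.org/save/status/<job_id>   check a single capture job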
def wait_for_api(available):
    """Wait until the Save Page Now API reports at least `available` free request slots."""
    wait_time = 1
    while True:
        status = requests.get(
            "https://web.archive.org/save/status/user",
            params={"_t": secrets.token_urlsafe(16)},  # cache-busting token
            cookies=cookies,
            headers=JSON_HEADER,
        ).json()
        if status["available"] >= available:
            return
        print(f"API Limit: sleeping for {wait_time} seconds")
        time.sleep(wait_time)
        wait_time *= 2  # exponential backoff


def main():
    # parse and validate command-line arguments
    parser = argparse.ArgumentParser(description="Archives your bookmarks with the Wayback Machine.")
    parser.add_argument("bookmark_file", help="A Netscape-format bookmarks file", type=str)
    # the dest names map the negated flags onto the parameter names the API expects
    parser.add_argument("--no_capture_all", "-n", dest="capture_all", default=True, action="store_false", help="Don't capture error pages")
    parser.add_argument("--capture_outlinks", "-c", default=False, action="store_true", help="Capture all outlinks")
    parser.add_argument("--capture_screenshot", "-s", default=False, action="store_true", help="Capture a screenshot")
    parser.add_argument("--delay_wb_availability", "-d", default=False, action="store_true", help="Delay uploading the capture")
    parser.add_argument("--force_get", "-f", default=False, action="store_true", help="Force a GET request")
    parser.add_argument("--no_skip_first_archive", "-a", dest="skip_first_archive", default=True, action="store_false", help="Don't recapture pages")
    parser.add_argument("--email_result", "-e", default=False, action="store_true", help="Email results to user")
    parser.add_argument("--quit_immediately", "-q", default=False, action="store_true", help="Don't show end results")
    parser.add_argument("--api_wait_seconds", "-w", default=8, type=int, help="Seconds to sleep between captures")
    arguments = vars(parser.parse_args())
    assert arguments["api_wait_seconds"] >= 0, "api_wait_seconds cannot be negative"
    # the API expects enabled boolean options as "1"; include only flags that are on
    data = {key: "1" for key, value in arguments.items() if value is True}
    print("Setting Up: command-line arguments processed")
    with open(arguments["bookmark_file"], "r", encoding="utf-8") as file:
        bookmark_file = file.read()
    bookmark_urls = re.findall(r'HREF="(.+?)"', bookmark_file)
    bookmark_names = re.findall(r'">(.+?)</A>', bookmark_file)
    bookmarks = [{"name": name, "url": url} for name, url in zip(bookmark_names, bookmark_urls)]
    print(f"Setting Up: {len(bookmarks)} browser bookmarks queued")
    # load the archive.org session cookies from a local browser
    global cookies
    cookies = browser_cookie3.load()
    print("Setting Up: chrome/firefox cookies loaded")
    # try to save every bookmark, retrying once on a known failure
    for bookmark in bookmarks:
        response = {}
        try:
            wait_for_api(1)
            print(f"Capturing: {bookmark['name']}")
            data["url"] = bookmark["url"]
            response = requests.post("https://web.archive.org/save", data=data, cookies=cookies, headers=JSON_HEADER).json()
            bookmark["job_id"] = response["job_id"]
        except KNOWN_ERRORS as error:
            print(f"Error: {response.get('status_ext', 'unknown network error')}")
            print(f"Error: {str(error)} during processing")
            response = {}
            try:  # retry once before giving up on this bookmark
                wait_for_api(1)
                print(f"Retrying: {bookmark['name']}")
                response = requests.post("https://web.archive.org/save", data=data, cookies=cookies, headers=JSON_HEADER).json()
                bookmark["job_id"] = response["job_id"]
            except KNOWN_ERRORS as retry_error:
                print(f"Error: {response.get('status_ext', 'unknown network error')}")
                print(f"Error: {str(retry_error)} during retry")
        time.sleep(arguments["api_wait_seconds"])  # the status endpoint lags behind new captures
    # short-circuit and quit without waiting for job results
    if arguments["quit_immediately"]:
        raise SystemExit
    # wait for the job queue to drain, then report per-bookmark results
    print("Finishing: waiting for completion")
    wait_for_api(6)
    job_ids = [bookmark.get("job_id", "placeholder-job-id") for bookmark in bookmarks]
    statuses = [
        requests.get(f"https://web.archive.org/save/status/{job_id}", cookies=cookies, headers=JSON_HEADER).json().get("status", "error")
        for job_id in job_ids
    ]
    print(f"Information: {len(statuses)} bookmarks in total")
    print(f"Information: {statuses.count('success')} bookmarks archived")
    if len(statuses) - statuses.count("success"):
        print(f"Information: {len(statuses) - statuses.count('success')} failed; check logs")
        for i, status in enumerate(statuses):
            if status != "success":
                print(f"Information: {bookmarks[i]['name']} failed, status {status}")
    print(f"Information: finished in {time.time() - START_TIME:.1f} seconds")


if __name__ == "__main__":
    main()