diff --git a/inb4404.py b/inb4404.py
old mode 100755
new mode 100644
index 2bdd106..cca3dff
--- a/inb4404.py
+++ b/inb4404.py
@@ -1,14 +1,28 @@
 #!/usr/bin/python3
-import urllib.request, urllib.error, urllib.parse, argparse, logging
-import os, re, time
-import http.client
+import argparse
 import fileinput
-from multiprocessing import Process
+import http.client
+import logging
+import os
+import re
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from multiprocessing import Process, Manager
 
 log = logging.getLogger('inb4404')
 workpath = os.path.dirname(os.path.realpath(__file__))
 args = None
 
+queue_cleanup_timer = 30  # in seconds, how often to check for dead links and mark them dead in the config file
+thread_check_timer = 20  # in seconds, how often to queue up all threads to check for new content
+
+manager = Manager()  # manager object used to create data types shared between processes
+tasks_to_accomplish = manager.list()  # queue the worker processes pull work out of
+links_to_remove = manager.list()  # queue used to keep track of threads to remove from the config file
+
+
 def main():
     global args
     parser = argparse.ArgumentParser(description='inb4404')
@@ -19,12 +33,13 @@ def main():
     parser.add_argument('-n', '--use-names', action='store_true', help='use thread names instead of the thread ids (...4chan.org/board/thread/thread-id/thread-name)')
     parser.add_argument('-r', '--reload', action='store_true', help='reload the queue file every 5 minutes')
     parser.add_argument('-t', '--title', action='store_true', help='save original filenames')
+    parser.add_argument('-p', '--parallel-threads', type=int, default=4, help='number of parallel download processes to run at once (default: 4)')
     args = parser.parse_args()
 
     if args.date:
         logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%Y-%m-%d %I:%M:%S %p')
     else:
-        logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%I:%M:%S %p')
+        logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(message)s', datefmt='%I:%M:%S %p')
 
     if args.title:
         try:
@@ -36,14 +51,18 @@ def main():
     thread = args.thread[0].strip()
 
     if thread[:4].lower() == 'http':
-        download_thread(thread, args)
+        while True:
+            download_thread(thread, args)
+            time.sleep(20)
     else:
         download_from_file(thread)
 
+
 def load(url):
     req = urllib.request.Request(url, headers={'User-Agent': '4chan Browser'})
     return urllib.request.urlopen(req).read()
 
+
 def get_title_list(html_content):
     ret = list()
 
@@ -52,7 +71,7 @@
     divs = parsed.find_all("div", {"class": "fileText"})
 
     for i in divs:
-        current_child = i.findChildren("a", recursive = False)[0]
+        current_child = i.findChildren("a", recursive=False)[0]
         try:
             ret.append(current_child["title"])
         except KeyError:
@@ -60,122 +79,143 @@
 
     return ret
 
-def call_download_thread(thread_link, args):
-    try:
-        download_thread(thread_link, args)
-    except KeyboardInterrupt:
-        pass
 
 def download_thread(thread_link, args):
     board = thread_link.split('/')[3]
     thread = thread_link.split('/')[5].split('#')[0]
     if len(thread_link.split('/')) > 6:
         thread_tmp = thread_link.split('/')[6].split('#')[0]
-
-        if args.use_names or os.path.exists(os.path.join(workpath, 'downloads', board, thread_tmp)):
+        if args.use_names or os.path.exists(os.path.join(workpath, 'downloads', board, thread_tmp)):
             thread = thread_tmp
 
-    while True:
-        try:
-            regex = '(\/\/i(?:s|)\d*\.(?:4cdn|4chan)\.org\/\w+\/(\d+\.(?:jpg|png|gif|webm)))'
-            html_result = load(thread_link).decode('utf-8')
-            regex_result = list(set(re.findall(regex, html_result)))
-            regex_result = sorted(regex_result, key=lambda tup: tup[1])
-            regex_result_len = len(regex_result)
-            regex_result_cnt = 1
+    try:
+        regex = r'(\/\/i(?:s|)\d*\.(?:4cdn|4chan)\.org\/\w+\/(\d+\.(?:jpg|png|gif|webm)))'
+        html_result = load(thread_link).decode('utf-8')
+        regex_result = list(set(re.findall(regex, html_result)))
+        regex_result = sorted(regex_result, key=lambda tup: tup[1])
+        regex_result_len = len(regex_result)
+        regex_result_cnt = 1
 
-            directory = os.path.join(workpath, 'downloads', board, thread)
-            if not os.path.exists(directory):
-                os.makedirs(directory)
+        directory = os.path.join(workpath, 'downloads', board, thread)
+        if not os.path.exists(directory):
+            os.makedirs(directory)
 
-            if args.title:
-                all_titles = get_title_list(html_result)
-
-            for enum_index, enum_tuple in enumerate(regex_result):
-                link, img = enum_tuple
-
-                if args.title:
-                    img = all_titles[enum_index]
-                    from django.utils.text import get_valid_filename
-                    img_path = os.path.join(directory, get_valid_filename(img))
-                else:
-                    img_path = os.path.join(directory, img)
-
-                if not os.path.exists(img_path):
-                    data = load('https:' + link)
-
-                    output_text = board + '/' + thread + '/' + img
-                    if args.with_counter:
-                        output_text = '[' + str(regex_result_cnt).rjust(len(str(regex_result_len))) + '/' + str(regex_result_len) + '] ' + output_text
-
-                    log.info(output_text)
-
-                    with open(img_path, 'wb') as f:
-                        f.write(data)
-
-                    ##################################################################################
-                    # saves new images to a seperate directory
-                    # if you delete them there, they are not downloaded again
-                    # if you delete an image in the 'downloads' directory, it will be downloaded again
-                    copy_directory = os.path.join(workpath, 'new', board, thread)
-                    if not os.path.exists(copy_directory):
-                        os.makedirs(copy_directory)
-                    copy_path = os.path.join(copy_directory, img)
-                    with open(copy_path, 'wb') as f:
-                        f.write(data)
-                    ##################################################################################
-                regex_result_cnt += 1
+        if args.title:
+            all_titles = get_title_list(html_result)
+
+        for enum_index, enum_tuple in enumerate(regex_result):
+            link, img = enum_tuple
+            if args.title:
+                img = all_titles[enum_index]
+                from django.utils.text import get_valid_filename
+                img_path = os.path.join(directory, get_valid_filename(img))
+            else:
+                img_path = os.path.join(directory, img)
+
+            if not os.path.exists(img_path):
+                data = load('https:' + link)
+
+                output_text = board + '/' + thread + '/' + img
+                if args.with_counter:
+                    output_text = '[' + str(regex_result_cnt).rjust(len(str(regex_result_len))) + '/' + str(regex_result_len) + '] ' + output_text
+
+                log.info(output_text)
+
+                with open(img_path, 'wb') as f:
+                    f.write(data)
+
+                ##################################################################################
+                # saves new images to a separate directory
+                # if you delete them there, they are not downloaded again
+                # if you delete an image in the 'downloads' directory, it will be downloaded again
+                copy_directory = os.path.join(workpath, 'new', board, thread)
+                if not os.path.exists(copy_directory):
+                    os.makedirs(copy_directory)
+                copy_path = os.path.join(copy_directory, img)
+                with open(copy_path, 'wb') as f:
+                    f.write(data)
+                ##################################################################################
+            regex_result_cnt += 1
+
+    except urllib.error.HTTPError:
+        time.sleep(10)
+        try:
+            load(thread_link)
         except urllib.error.HTTPError:
-            time.sleep(10)
-            try:
-                load(thread_link)
-            except urllib.error.HTTPError:
-                log.info('%s 404\'d', thread_link)
-                break
-            continue
-        except (urllib.error.URLError, http.client.BadStatusLine, http.client.IncompleteRead):
-            log.fatal(thread_link + ' crashed!')
-            raise
-
-        if not args.less:
-            log.info('Checking ' + board + '/' + thread)
-        time.sleep(20)
+            log.info('%s 404\'d', thread_link)
+            links_to_remove.append(thread_link)
+    except (urllib.error.URLError, http.client.BadStatusLine, http.client.IncompleteRead):
+        log.fatal(thread_link + ' crashed!')
 
-def download_from_file(filename):
-    running_links = []
+    if not args.less:
+        log.info('Checking ' + board + '/' + thread)
+
+
+def call_download_thread(queue, args):
     while True:
-        processes = []
-        for link in [_f for _f in [line.strip() for line in open(filename) if line[:4] == 'http'] if _f]:
-            if link not in running_links:
-                running_links.append(link)
-                log.info('Added ' + link)
-
-                process = Process(target=call_download_thread, args=(link, args, ))
-                process.start()
-                processes.append([process, link])
-
-        if len(processes) == 0:
-            log.warning(filename + ' empty')
-
-        if args.reload:
-            time.sleep(60 * 5) # 5 minutes
-            links_to_remove = []
-            for process, link in processes:
-                if not process.is_alive():
-                    links_to_remove.append(link)
-                else:
-                    process.terminate()
-
-            for link in links_to_remove:
-                for line in fileinput.input(filename, inplace=True):
-                    print(line.replace(link, '-' + link), end='')
-                running_links.remove(link)
-                log.info('Removed ' + link)
-            if not args.less:
-                log.info('Reloading ' + args.thread[0]) # thread = filename here; reloading on next loop
-        else:
+        try:
+            if len(queue) == 0:  # check if there are any jobs waiting
+                time.sleep(0.25)  # sleep to prevent the while loop from dominating a CPU core
+            else:
+                download_thread(queue.pop(0), args)
+        except KeyboardInterrupt:
             break
+        except Exception:
+            pass
+
+
+def download_from_file(filename):
+    processes = []
+    running_links = []  # 4chan threads to check periodically
+
+    last_config_reload = time.time()
+    last_queue_check = time.time()
+
+    while len(processes) < args.parallel_threads:
+        p = Process(target=call_download_thread, args=(tasks_to_accomplish, args))
+        processes.append(p)
+        p.start()
+
+    try:
+        while True:
+            for link in [line.strip() for line in open(filename) if line[:4] == 'http']:
+                if link not in running_links:
+                    running_links.append(link)
+                    tasks_to_accomplish.append(link)
+                    log.info('Added ' + link)
+
+            # if enough time has passed, queue up every running thread to be checked again
+            if time.time() >= (last_queue_check + thread_check_timer):
+                for i in running_links:
+                    if i not in tasks_to_accomplish:  # only re-queue the link if it is not already waiting in the queue
+                        tasks_to_accomplish.append(i)
+                last_queue_check = time.time()
+
+            # check if there are any links that have died, and mark them as dead in the config file so they are no longer checked
+            if args.reload and time.time() >= (last_config_reload + queue_cleanup_timer):
+                for link in list(links_to_remove):  # iterate over a copy so items can be removed while looping
+                    for line in fileinput.input(filename, inplace=True):
+                        print(line.replace(link, '-' + link), end='')
+                    running_links.remove(link)
+                    links_to_remove.remove(link)
+                    log.info('Removed ' + link)
+                    if not args.less:
+                        log.info('Reloading ' + args.thread[0])  # thread = filename here; reloading on next loop
+                last_config_reload = time.time()
+
+            # if, for some reason, we do not have the required number of worker processes running, spin up new ones
+            while len(processes) < args.parallel_threads:
+                p = Process(target=call_download_thread, args=(tasks_to_accomplish, args))
+                processes.append(p)
+                p.start()
+
+            # reap any worker processes that exited or crashed, blocking for .25 seconds on each
+            for process in processes:
+                process.join(0.25)
+
+    except KeyboardInterrupt:
+        for p in processes:  # shut down the worker processes
+            p.terminate()
 
 if __name__ == '__main__':
@@ -183,4 +223,3 @@
     try:
         main()
     except KeyboardInterrupt:
         pass
-