diff --git a/__init__.py b/__init__.py new file mode 100644 index 00000000..9ae757d9 --- /dev/null +++ b/__init__.py @@ -0,0 +1,343 @@ +# coding: utf-8 +import logging +from logging.config import dictConfig +import platform +import re +import time +import traceback + +from flask import Flask, current_app, render_template, url_for +from flask_compress import Compress +from logparser import __version__ as LOGPARSER_VERSION + +from .__version__ import __url__, __version__ +from .common import handle_metadata +from .models import Metadata, db +from .vars import PYTHON_VERSION, SQLALCHEMY_BINDS, SQLALCHEMY_DATABASE_URI +# from .utils.scheduler import scheduler + + +# https://stackoverflow.com/questions/18820274/how-to-suppress-sqlalchemy-engine-base-engine-logging-to-stdout +# logging.getLogger('sqlalchemy.engine.base.Engine').propagate = False +logging.getLogger('sqlalchemy.engine.base.Engine').setLevel(logging.WARNING) +# http://flask.pocoo.org/docs/1.0/logging/#basic-configuration +dictConfig({ + 'version': 1, + 'formatters': {'default': { + 'format': '[%(asctime)s] %(levelname)-8s in %(name)s: %(message)s', + }}, + 'handlers': {'wsgi': { + 'class': 'logging.StreamHandler', + 'stream': 'ext://flask.logging.wsgi_errors_stream', + 'formatter': 'default' + }}, + 'root': { + 'level': 'DEBUG', + 'handlers': ['wsgi'] + } +}) + +# Comment out the dictConfig above first +# https://docs.sqlalchemy.org/en/latest/core/engines.html#configuring-logging +# https://apscheduler.readthedocs.io/en/latest/userguide.html#troubleshooting +# logging.basicConfig() +# logging.getLogger('apscheduler').setLevel(logging.DEBUG) +# logging.getLogger('sqlalchemy.engine').setLevel(logging.DEBUG) + + +def internal_server_error(error): + kwargs = dict( + error=error, + traceback=traceback.format_exc(), + url_issues=__url__ + '/issues', + os=platform.platform(), + python_version=PYTHON_VERSION, + scrapydweb_version=__version__, + logparser_version=LOGPARSER_VERSION, + scrapyd_servers_amount=len(current_app.config.get('SCRAPYD_SERVERS', [])) + ) + return render_template('500.html', **kwargs), 500 + + +def create_app(test_config=None): + app = Flask(__name__, instance_relative_config=True) + app.config.from_mapping( + SECRET_KEY='dev', + ) + # http://flask.pocoo.org/docs/1.0/config/#configuring-from-files + app.config.from_object('scrapydweb.default_settings') + + if test_config is None: + # load the instance config, if it exists, when not testing + app.config.from_pyfile('config.py', silent=True) + else: + # load the test config if passed in + app.config.from_mapping(test_config) + + @app.route('/hello') + def hello(): + return 'Hello, World!' 
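    # Illustrative usage sketch (an assumption, not part of this diff): the factory is normally
    # driven by an entry point such as run.py (referenced under "System" in default_settings.py),
    # or by a test that passes an override mapping:
    #     from scrapydweb import create_app
    #     app = create_app()                          # also loads instance/config.py if present
    #     test_app = create_app(dict(TESTING=True))   # TESTING key shown for illustration only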
+ + handle_db(app) + handle_route(app) + handle_template_context(app) + + # @app.errorhandler(404) + # def handle_error(error): + # return ('Nothing Found', 404) + # http://flask.pocoo.org/docs/1.0/patterns/errorpages/ + app.register_error_handler(500, internal_server_error) + + # https://ansible-docs.readthedocs.io/zh/stable-2.0/rst/playbooks_filters.html#other-useful-filters + # https://stackoverflow.com/questions/12791216/how-do-i-use-regular-expressions-in-jinja2 + # https://www.michaelcho.me/article/custom-jinja-template-filters-in-flask + # http://flask.pocoo.org/docs/1.0/api/#flask.Flask.template_filter + @app.template_filter() + def regex_replace(s, find, replace): + return re.sub(find, replace, s) + app.jinja_env.variable_start_string = '{{ ' + app.jinja_env.variable_end_string = ' }}' + + compress = Compress() + compress.init_app(app) + + app.logger.setLevel(logging.DEBUG) + return app + + +def handle_db(app): + # https://flask-sqlalchemy.palletsprojects.com/en/master/config/ + app.config['SQLALCHEMY_DATABASE_URI'] = SQLALCHEMY_DATABASE_URI + app.config['SQLALCHEMY_BINDS'] = SQLALCHEMY_BINDS + app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False # https://stackoverflow.com/a/33790196/10517783 + app.config['SQLALCHEMY_ECHO'] = True # http://flask-sqlalchemy.pocoo.org/2.3/config/ + + # flask_sqlalchemy/__init__.py + # class SQLAlchemy(object): + # def __init__(self, app=None + # self.app = app + # if app is not None: + # self.init_app(app) + db.app = app # https://github.com/viniciuschiele/flask-apscheduler/blob/master/examples/flask_context.py + db.init_app(app) # http://flask-sqlalchemy.pocoo.org/2.3/contexts/ + db.create_all() + + # https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-vii-error-handling + @app.teardown_request + def handle_db_session(exception): + if exception: + db.session.rollback() + db.session.remove() + + with db.app.app_context(): + if not Metadata.query.filter_by(version=__version__).first(): + metadata = Metadata(version=__version__) + db.session.add(metadata) + db.session.commit() + if time.time() - handle_metadata().get('last_check_update_timestamp', time.time()) > 3600 * 24 * 30: + handle_metadata('last_check_update_timestamp', time.time()) + handle_metadata('pageview', 0) + else: + handle_metadata('pageview', 1) + # print(Metadata.query.filter_by(version=__version__).first()) + + +def handle_route(app): + def register_view(view, endpoint, url_defaults_list, with_node=True, trailing_slash=True): + view_func = view.as_view(endpoint) + for url, defaults in url_defaults_list: + rule = '//%s' % url if with_node else '/%s' % url + if trailing_slash: + rule += '/' + if not with_node: + if defaults: + defaults['node'] = 1 + else: + defaults = dict(node=1) + app.add_url_rule(rule, defaults=defaults, view_func=view_func) + + from .views.index import IndexView + index_view = IndexView.as_view('index') + app.add_url_rule('//', view_func=index_view) + app.add_url_rule('/', defaults=dict(node=1), view_func=index_view) + + from .views.api import ApiView + register_view(ApiView, 'api', [ + ('api///', None), + ('api//', dict(version_spider_job=None)), + ('api/', dict(project=None, version_spider_job=None)) + ]) + + from .views.baseview import MetadataView + register_view(MetadataView, 'metadata', [('metadata', None)]) + + # Overview + from .views.overview.servers import ServersView + register_view(ServersView, 'servers', [ + ('servers/getreports///', dict(opt='getreports')), + ('servers////', None), + ('servers///', dict(spider=None)), + ('servers//', 
dict(version_job=None, spider=None)), + ('servers/', dict(project=None, version_job=None, spider=None)), + ('servers', dict(opt=None, project=None, version_job=None, spider=None)) + ]) + + from .views.overview.multinode import MultinodeView + register_view(MultinodeView, 'multinode', [ + ('multinode///', None), + ('multinode//', dict(version_job=None)) + ]) + + from .views.overview.tasks import TasksView, TasksXhrView + register_view(TasksView, 'tasks', [ + ('tasks//', None), + ('tasks/', dict(task_result_id=None)), + ('tasks', dict(task_id=None, task_result_id=None)) + ]) + register_view(TasksXhrView, 'tasks.xhr', [ + ('tasks/xhr///', None), + ('tasks/xhr//', dict(task_result_id=None)), + ('tasks/xhr/', dict(task_id=None, task_result_id=None)) + ]) + + from .views.overview.tasks import bp as bp_tasks_history + app.register_blueprint(bp_tasks_history) + + # Dashboard + from .views.dashboard.jobs import JobsView, JobsXhrView + register_view(JobsView, 'jobs', [('jobs', None)]) + register_view(JobsXhrView, 'jobs.xhr', [('jobs/xhr//', None)]) + + from .views.dashboard.node_reports import NodeReportsView + register_view(NodeReportsView, 'nodereports', [('nodereports', None)]) + + from .views.dashboard.cluster_reports import ClusterReportsView + register_view(ClusterReportsView, 'clusterreports', [ + ('clusterreports///', None), + ('clusterreports', dict(project=None, spider=None, job=None)) + ]) + + # Operations + from .views.operations.deploy import DeployView, DeployUploadView, DeployXhrView + register_view(DeployView, 'deploy', [('deploy', None)]) + register_view(DeployUploadView, 'deploy.upload', [('deploy/upload', None)]) + register_view(DeployXhrView, 'deploy.xhr', [('deploy/xhr///', None)]) + + from .views.operations.schedule import (ScheduleView, ScheduleCheckView, ScheduleRunView, + ScheduleXhrView, ScheduleTaskView) + register_view(ScheduleView, 'schedule', [ + ('schedule///', None), + ('schedule//', dict(spider=None)), + ('schedule/', dict(version=None, spider=None)), + ('schedule', dict(project=None, version=None, spider=None)) + ]) + register_view(ScheduleCheckView, 'schedule.check', [('schedule/check', None)]) + register_view(ScheduleRunView, 'schedule.run', [('schedule/run', None)]) + register_view(ScheduleXhrView, 'schedule.xhr', [('schedule/xhr/', None)]) + register_view(ScheduleTaskView, 'schedule.task', [('schedule/task', None)]) + + from .views.operations.schedule import bp as bp_schedule_history + app.register_blueprint(bp_schedule_history) + + # Files + from .views.files.log import LogView + register_view(LogView, 'log', [('log////', None)]) + + from .views.files.logs import LogsView + register_view(LogsView, 'logs', [ + ('logs//', None), + ('logs/', dict(spider=None)), + ('logs', dict(project=None, spider=None)) + ]) + + from .views.files.items import ItemsView + register_view(ItemsView, 'items', [ + ('items//', None), + ('items/', dict(spider=None)), + ('items', dict(project=None, spider=None)) + ]) + + from .views.files.projects import ProjectsView + register_view(ProjectsView, 'projects', [ + ('projects///', None), + ('projects//', dict(version_spider_job=None)), + ('projects', dict(opt='listprojects', project=None, version_spider_job=None)) + ]) + + # Parse Log + from .views.utilities.parse import UploadLogView, UploadedLogView + register_view(UploadLogView, 'parse.upload', [('parse/upload', None)]) + register_view(UploadedLogView, 'parse.uploaded', [('parse/uploaded/', None)]) + + from .views.utilities.parse import bp as bp_parse_source + 
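    # A minimal sketch of the register_view() helper defined above, with a hypothetical
    # view and URL names assumed purely for illustration:
    #     register_view(SomeView, 'some', [
    #         ('some/rule', None),            # each pair becomes one add_url_rule() call
    #         ('some', dict(rule=None)),      # missing URL parts supplied via defaults
    #     ])
    #     register_view(SomeApiView, 'someapi', [('someapi', None)],
    #                   with_node=False, trailing_slash=False)  # injects node=1, keeps rule slash-free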
app.register_blueprint(bp_parse_source) + + # Send text + from .views.utilities.send_text import SendTextView, SendTextApiView + register_view(SendTextView, 'sendtext', [('sendtext', None)]) + register_view(SendTextApiView, 'sendtextapi', [ + ('slack//', dict(opt='slack')), + ('slack/', dict(opt='slack', channel_chatid_subject=None)), + ('slack', dict(opt='slack', channel_chatid_subject=None, text=None)), + ('telegram//', dict(opt='telegram')), + ('telegram/', dict(opt='telegram', channel_chatid_subject=None)), + ('telegram', dict(opt='telegram', channel_chatid_subject=None, text=None)), + ('tg//', dict(opt='tg')), + ('tg/', dict(opt='tg', channel_chatid_subject=None)), + ('tg', dict(opt='tg', channel_chatid_subject=None, text=None)), + ('email//', dict(opt='email')), + ('email/', dict(opt='email', channel_chatid_subject=None)), + ('email', dict(opt='email', channel_chatid_subject=None, text=None)), + ], with_node=False, trailing_slash=False) + + # System + from .views.system.settings import SettingsView + register_view(SettingsView, 'settings', [('settings', None)]) + + +def handle_template_context(app): + STATIC = 'static' + VERSION = 'v' + __version__.replace('.', '') + # MUST be commented out for released version + # VERSION = 'v131dev' + + @app.context_processor + def inject_variable(): + return dict( + CHECK_LATEST_VERSION_FREQ=100, + GITHUB_URL=__url__, + PYTHON_VERSION=PYTHON_VERSION, + SCRAPYDWEB_VERSION=__version__, + + # static_css_common=url_for(STATIC, filename='%s/css/common.css' % VERSION), + static_css_dropdown=url_for(STATIC, filename='%s/css/dropdown.css' % VERSION), + static_css_dropdown_mobileui=url_for(STATIC, filename='%s/css/dropdown_mobileui.css' % VERSION), + static_css_icon_upload_icon_right=url_for(STATIC, + filename='%s/css/icon_upload_icon_right.css' % VERSION), + static_css_multinode=url_for(STATIC, filename='%s/css/multinode.css' % VERSION), + static_css_stacktable=url_for(STATIC, filename='%s/css/stacktable.css' % VERSION), + static_css_stats=url_for(STATIC, filename='%s/css/stats.css' % VERSION), + static_css_style=url_for(STATIC, filename='%s/css/style.css' % VERSION), + static_css_style_mobileui=url_for(STATIC, filename='%s/css/style_mobileui.css' % VERSION), + static_css_utf8=url_for(STATIC, filename='%s/css/utf8.css' % VERSION), + static_css_utf8_mobileui=url_for(STATIC, filename='%s/css/utf8_mobileui.css' % VERSION), + + static_css_element_ui_index=url_for(STATIC, + filename='%s/element-ui@2.4.6/lib/theme-chalk/index.css' % VERSION), + static_js_element_ui_index=url_for(STATIC, filename='%s/element-ui@2.4.6/lib/index.js' % VERSION), + + static_js_common=url_for(STATIC, filename='%s/js/common.js' % VERSION), + static_js_echarts_min=url_for(STATIC, filename='%s/js/echarts.min.js' % VERSION), + static_js_icons_menu=url_for(STATIC, filename='%s/js/icons_menu.js' % VERSION), + # static_js_github_buttons_html=url_for(STATIC, filename='%s/js/github_buttons.html' % VERSION), + static_js_github_buttons=url_for(STATIC, filename='%s/js/github_buttons.js' % VERSION), + static_js_jquery_min=url_for(STATIC, filename='%s/js/jquery.min.js' % VERSION), + static_js_multinode=url_for(STATIC, filename='%s/js/multinode.js' % VERSION), + static_js_stacktable=url_for(STATIC, filename='%s/js/stacktable.js' % VERSION), + static_js_stats=url_for(STATIC, filename='%s/js/stats.js' % VERSION), + static_js_vue_min=url_for(STATIC, filename='%s/js/vue.min.js' % VERSION), + + static_icon=url_for(STATIC, filename='%s/icon/fav.ico' % VERSION), + static_icon_shortcut=url_for(STATIC, 
filename='%s/icon/fav.ico' % VERSION), + static_icon_apple_touch=url_for(STATIC, filename='%s/icon/spiderman.png' % VERSION), + ) diff --git a/__version__.py b/__version__.py new file mode 100644 index 00000000..4b798a53 --- /dev/null +++ b/__version__.py @@ -0,0 +1,10 @@ +# coding: utf-8 + +__title__ = 'scrapydweb' +__version__ = '1.4.0' +__author__ = 'my8100' +__author_email__ = 'my8100@gmail.com' +__url__ = 'https://github.com/my8100/scrapydweb' +__license__ = 'GNU General Public License v3.0' +__description__ = ("Web app for Scrapyd cluster management, " + "with support for Scrapy log analysis & visualization.") diff --git a/common.py b/common.py new file mode 100644 index 00000000..65925421 --- /dev/null +++ b/common.py @@ -0,0 +1,106 @@ +# coding: utf-8 +import json +import os +import re +import time +import traceback + +from flask import current_app as app +from flask import Response +import requests +from requests.adapters import HTTPAdapter +from w3lib.http import basic_auth_header + +from .__version__ import __version__ +from .models import Metadata, db + + +session = requests.Session() +session.mount('http://', HTTPAdapter(pool_connections=1000, pool_maxsize=1000)) +session.mount('https://', HTTPAdapter(pool_connections=1000, pool_maxsize=1000)) + + +# http://flask.pocoo.org/snippets/category/authentication/ +def authenticate(): + """Sends a 401 response that enables basic auth""" + return Response("", + 401, {'WWW-Authenticate': 'Basic realm="ScrapydWeb Basic Auth Required"'}) + + +def find_scrapydweb_settings_py(filename, path, prevpath=None): + if path == prevpath: + return '' + path = os.path.abspath(path) + cfgfile = os.path.join(path, filename) + if os.path.exists(cfgfile): + return cfgfile + # In vars.py, try to import module scrapydweb_settings_vN in cwd only + # return find_scrapydweb_settings_py(filename, os.path.dirname(path), path) + + +def get_now_string(allow_space=False): + if allow_space: + return time.strftime('%Y-%m-%d %H:%M:%S') + else: + return time.strftime('%Y-%m-%dT%H_%M_%S') + + +def get_response_from_view(url, auth=None, data=None, as_json=False): + # https://stackoverflow.com/a/21342070/10517783 How do I call one Flask view from another one? + # https://stackoverflow.com/a/30250045/10517783 + # python - Flask test_client() doesn't have request.authorization with pytest + client = app.test_client() + if auth is not None: + headers = {'Authorization': basic_auth_header(*auth)} + else: + headers = {} + if data is not None: + response = client.post(url, headers=headers, data=data, content_type='multipart/form-data') + # response = client.post(url, headers=headers, data=data, content_type='application/json') + else: + response = client.get(url, headers=headers) + + text = response.get_data(as_text=True) + if as_json: + # e.g. when used in schedule_task() + # 'node index error: %s, which should be between 1 and %s' % (self.node, self.SCRAPYD_SERVERS_AMOUNT) + # json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0) + try: + return json.loads(text) + except ValueError: + # '_status': '500 INTERNAL SERVER ERROR', + # '_status_code': 500, + # See 500.html + # Errornode index error: 2, which should be between 1 and 1 + #
Traceback...AssertionError: node index error: 2, which should be between 1 and 1 
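            # A minimal sketch (values illustrative) of the fallback dict built just below
            # when the 500 page cannot be parsed as JSON:
            #     {'status_code': 500, 'status': 'error',
            #      'message': 'node index error: 2, which should be between 1 and 1'}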
+ m = re.search(r'Error(.+?)', text, re.S) + message = m.group(1) if m else text + return dict(status_code=getattr(response, '_status_code', 500), status='error', message=message) + else: + return text + + +def handle_metadata(key=None, value=None): + with db.app.app_context(): + metadata = Metadata.query.filter_by(version=__version__).first() + if key is None: + # '_sa_instance_state': , + return dict((k, v) for (k, v) in metadata.__dict__.items() if not k.startswith('_')) if metadata else {} + else: + try: + setattr(metadata, key, value) + db.session.commit() + except: + print(traceback.format_exc()) + db.session.rollback() + + +def handle_slash(string): + if not string: + return string + else: + return string.replace('\\', '/') + + +def json_dumps(obj, sort_keys=True, indent=4, ensure_ascii=False): + return json.dumps(obj, sort_keys=sort_keys, indent=indent, ensure_ascii=ensure_ascii) diff --git a/default_settings.py b/default_settings.py new file mode 100644 index 00000000..cc3442c3 --- /dev/null +++ b/default_settings.py @@ -0,0 +1,371 @@ +# coding: utf-8 +""" +How ScrapydWeb works: +BROWSER <<<>>> SCRAPYDWEB_BIND:SCRAPYDWEB_PORT <<<>>> your SCRAPYD_SERVERS + +GitHub: https://github.com/my8100/scrapydweb +DOCS: https://github.com/my8100/files/blob/master/scrapydweb/README.md +文档:https://github.com/my8100/files/blob/master/scrapydweb/README_CN.md +""" +import os + + +############################## QUICK SETUP start ############################## +############################## 快速设置 开始 ################################### +# Setting SCRAPYDWEB_BIND to '0.0.0.0' or IP-OF-THE-CURRENT-HOST would make +# ScrapydWeb server visible externally; Otherwise, set it to '127.0.0.1'. +# The default is '0.0.0.0'. +SCRAPYDWEB_BIND = "0.0.0.0" +# Accept connections on the specified port, the default is 5000. +SCRAPYDWEB_PORT = 5000 + +# The default is False, set it to True to enable basic auth for the web UI. +ENABLE_AUTH = False +# In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings. +USERNAME = "" +PASSWORD = "" + + +# Make sure that [Scrapyd](https://github.com/scrapy/scrapyd) has been installed +# and started on all of your hosts. +# Note that for remote access, you have to manually set 'bind_address = 0.0.0.0' +# in the configuration file of Scrapyd and restart Scrapyd to make it visible externally. +# Check out 'https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file' for more info. +# ------------------------------ Chinese -------------------------------------- +# 请先确保所有主机都已经安装和启动 [Scrapyd](https://github.com/scrapy/scrapyd)。 +# 如需远程访问 Scrapyd,则需在 Scrapyd 配置文件中设置 'bind_address = 0.0.0.0',然后重启 Scrapyd。 +# 详见 https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file + +# - the string format: username:password@ip:port#group +# - The default port would be 6800 if not provided, +# - Both basic auth and group are optional. +# - e.g. '127.0.0.1:6800' or 'username:password@localhost:6801#group' +# - the tuple format: (username, password, ip, port, group) +# - When the username, password, or group is too complicated (e.g. contains ':@#'), +# - or if ScrapydWeb fails to parse the string format passed in, +# - it's recommended to pass in a tuple of 5 elements. +# - e.g. 
('', '', '127.0.0.1', '6800', '') or ('username', 'password', 'localhost', '6801', 'group') +SCRAPYD_SERVERS = [ + "127.0.0.1:6800", + # 'username:password@localhost:6801#group', + ("username", "password", "localhost", "6801", "group"), +] + + +# It's recommended to update the three options below +# if both ScrapydWeb and one of your Scrapyd servers run on the same machine. +# ------------------------------ Chinese -------------------------------------- +# 假如 ScrapydWeb 和某个 Scrapyd 运行于同一台主机,建议更新如下三个设置项。 + +# If both ScrapydWeb and one of your Scrapyd servers run on the same machine, +# ScrapydWeb would try to directly read Scrapy logfiles from disk, instead of making a request +# to the Scrapyd server. +# e.g. '127.0.0.1:6800' or 'localhost:6801', do not forget the port number. +LOCAL_SCRAPYD_SERVER = "" + +# Enter the directory when you run Scrapyd, run the command below +# to find out where the Scrapy logs are stored: +# python -c "from os.path import abspath, isdir; from scrapyd.config import Config; path = abspath(Config().get('logs_dir')); print(path); print(isdir(path))" +# Check out https://scrapyd.readthedocs.io/en/stable/config.html#logs-dir for more info. +# e.g. 'C:/Users/username/logs' or '/home/username/logs' +LOCAL_SCRAPYD_LOGS_DIR = "" + +# The default is False, set it to True to automatically run LogParser as a subprocess at startup. +# Note that you can run the LogParser service separately via command 'logparser' as you like. +# Run 'logparser -h' to find out the config file of LogParser for more advanced settings. +# Visit https://github.com/my8100/logparser for more info. +ENABLE_LOGPARSER = False +############################## QUICK SETUP end ################################ +############################## 快速设置 结束 ################################### + + +############################## ScrapydWeb ##################################### +# The default is False, set it to True and add both CERTIFICATE_FILEPATH and PRIVATEKEY_FILEPATH +# to run ScrapydWeb in HTTPS mode. +# Note that this feature is not fully tested, please leave your comment here if ScrapydWeb +# raises any excepion at startup: https://github.com/my8100/scrapydweb/issues/18 +ENABLE_HTTPS = False +# e.g. '/home/username/cert.pem' +CERTIFICATE_FILEPATH = "" +# e.g. '/home/username/cert.key' +PRIVATEKEY_FILEPATH = "" + + +############################## Scrapy ######################################### +# ScrapydWeb is able to locate projects in the SCRAPY_PROJECTS_DIR, +# so that you can simply select a project to deploy, instead of packaging it in advance. +# e.g. 'C:/Users/username/myprojects' or '/home/username/myprojects' +SCRAPY_PROJECTS_DIR = "" +# Set the maximum number of hours that a scraper can run for. +MAX_HOURS = 5 +# Set 0 to disable auto-stopping of long running scrapers +LONG_RUNNING_SCRAPER_STOP_INTERVAL = 1 + +############################## Scrapyd ######################################## +# ScrapydWeb would try every extension in sequence to locate the Scrapy logfile. +# The default is ['.log', '.log.gz', '.txt']. +SCRAPYD_LOG_EXTENSIONS = [".log", ".log.gz", ".txt"] + +# The default is None, only set it up when you need to visit Scrapyd servers via reverse proxy. +# Make sure that SCRAPYD_SERVERS_PUBLIC_URLS has same length with SCRAPYD_SERVERS above. +# e.g. +# SCRAPYD_SERVERS_PUBLIC_URLS = [ +# 'https://a.b.com', # visit the first Scrapyd server via reverse proxy. +# '', # visit the second Scrapyd server without reverse proxy. 
+# ] +# See https://github.com/my8100/scrapydweb/issues/94 for more info. +SCRAPYD_SERVERS_PUBLIC_URLS = None + + +############################## LogParser ###################################### +# Whether to backup the stats json files locally after you visit the Stats page of a job +# so that it is still accessible even if the original logfile has been deleted. +# The default is True, set it to False to disable this behaviour. +BACKUP_STATS_JSON_FILE = True + + +############################## Timer Tasks #################################### +# Run ScrapydWeb with argument '-sw' or '--switch_scheduler_state', or click the ENABLED|DISABLED button +# on the Timer Tasks page to turn on/off the scheduler for the timer tasks and the snapshot mechanism below. + +# The default is 300, which means ScrapydWeb would automatically create a snapshot of the Jobs page +# and save the jobs info in the database in the background every 300 seconds. +# Note that this behaviour would be paused if the scheduler for timer tasks is disabled. +# Set it to 0 to disable this behaviour. +JOBS_SNAPSHOT_INTERVAL = 300 + + +############################## Run Spider ##################################### +# The default is False, set it to True to automatically +# expand the 'settings & arguments' section in the Run Spider page. +SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = False + +# The default is 'Mozilla/5.0', set it a non-empty string to customize the default value of `custom` +# in the drop-down list of `USER_AGENT`. +SCHEDULE_CUSTOM_USER_AGENT = "Mozilla/5.0" + +# The default is None, set it to any value of ['custom', 'Chrome', 'iPhone', 'iPad', 'Android'] +# to customize the default value of `USER_AGENT`. +SCHEDULE_USER_AGENT = None + +# The default is None, set it to True or False to customize the default value of `ROBOTSTXT_OBEY`. +SCHEDULE_ROBOTSTXT_OBEY = None + +# The default is None, set it to True or False to customize the default value of `COOKIES_ENABLED`. +SCHEDULE_COOKIES_ENABLED = None + +# The default is None, set it to a non-negative integer to customize the default value of `CONCURRENT_REQUESTS`. +SCHEDULE_CONCURRENT_REQUESTS = None + +# The default is None, set it to a non-negative number to customize the default value of `DOWNLOAD_DELAY`. +SCHEDULE_DOWNLOAD_DELAY = None + +# The default is "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1", +# set it to '' or any non-empty string to customize the default value of `additional`. +# Use '\r\n' as the line separator. +SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1" + + +############################## Page Display ################################### +# The default is True, set it to False to hide the Items page, as well as +# the Items column in the Jobs page. +SHOW_SCRAPYD_ITEMS = True + +# The default is True, set it to False to hide the Job column in the Jobs page with non-database view. +SHOW_JOBS_JOB_COLUMN = True + +# The default is 0, which means unlimited, set it to a positive integer so that +# only the latest N finished jobs would be shown in the Jobs page with non-database view. +JOBS_FINISHED_JOBS_LIMIT = 0 + +# If your browser stays on the Jobs page, it would be reloaded automatically every N seconds. +# The default is 300, set it to 0 to disable auto-reloading. +JOBS_RELOAD_INTERVAL = 300 + +# The load status of the current Scrapyd server is checked every N seconds, +# which is displayed in the top right corner of the page. 
+# The default is 10, set it to 0 to disable auto-refreshing. +DAEMONSTATUS_REFRESH_INTERVAL = 10 + + +############################## Send Text ###################################### +########## usage in scrapy projects ########## +# See the "Send Text" page + +########## slack ########## +# How to create a slack app: +# 1. Visit https://api.slack.com/apps and press the "Create New App" button. +# 2. Enter your App Name (e.g. myapp)and select one of your Slack Workspaces, the press "Create App". +# 3. Click the "OAuth & Permissions" menu in the sidebar on the left side of the page. +# 4. Scroll down the page and find out "Select Permission Scopes" in the "Scopes" section +# 5. Enter "send" and select "Send messages as ", then press "Save Changes" +# 6. Scroll up the page and press "Install App to Workspace", then press "Install" +# 7. Copy the "OAuth Access Token", e.g. xoxp-123-456-789-abcde +# See https://api.slack.com/apps for more info + +# See step 1~7 above, e.g. 'xoxp-123-456-789-abcde' +SLACK_TOKEN = os.environ.get("SLACK_TOKEN", "") +# The default channel to use when sending text via slack, e.g. 'general' +SLACK_CHANNEL = "general" + +########## telegram ########## +# How to create a telegram bot: +# 1. Visit https://telegram.me/botfather to start a conversation with Telegram's bot that creates other bots. +# 2. Send the /newbot command to create a new bot in a chat with BotFather. +# 3. Follow the instructions to set up name and username (e.g. my_bot) for your bot. +# 4. You would get a token (e.g. 123:abcde) after step 3. +# 5. Visit telegram.me/ (e.g. telegram.me/my_bot) and say hi to your bot to initiate a conversation. +# 6. Visit https://api.telegram.org/bot/getUpdates to get the chat_id. +# (e.g. Visit https://api.telegram.org/bot123:abcde/getUpdates +# and you can find the chat_id in "chat":{"id":123456789,...) +# See https://core.telegram.org/bots#6-botfather for more info + +# See step 1~4 above, e.g. '123:abcde' +TELEGRAM_TOKEN = os.environ.get("TELEGRAM_TOKEN", "") +# See step 5~6 above, e.g. 123456789 +TELEGRAM_CHAT_ID = int(os.environ.get("TELEGRAM_CHAT_ID", 0)) + +########## email ########## +# The default subject to use when sending text via email. +EMAIL_SUBJECT = "Email from #scrapydweb" + +########## email sender & recipients ########## +# Leave this option as '' to default to the EMAIL_SENDER option below; Otherwise, set it up +# if your email service provider requires an username which is different from the EMAIL_SENDER option below to login. +# e.g. 'username' +EMAIL_USERNAME = "" +# As for different email service provider, you might have to get an APP password (like Gmail) +# or an authorization code (like QQ mail) and set it as the EMAIL_PASSWORD. +# Check out links below to get more help: +# https://stackoverflow.com/a/27515833/10517783 How to send an email with Gmail as the provider using Python? +# https://stackoverflow.com/a/26053352/10517783 Python smtplib proxy support +# e.g. 'password4gmail' +EMAIL_PASSWORD = os.environ.get("EMAIL_PASSWORD", "") + +# e.g. 'username@gmail.com' +EMAIL_SENDER = "" +# e.g. 
['username@gmail.com', ] +EMAIL_RECIPIENTS = [EMAIL_SENDER] + +########## email smtp settings ########## +# Check out this link if you are using ECS of Alibaba Cloud and your SMTP server provides TCP port 25 only: +# https://www.alibabacloud.com/help/doc-detail/56130.htm +# Config for https://mail.google.com using SSL: ('smtp.gmail.com', 465, True) +# Config for https://mail.google.com: ('smtp.gmail.com', 587, False) +# Config for https://mail.qq.com using SSL: ('smtp.qq.com', 465, True) +# Config for http://mail.10086.cn: ('smtp.139.com', 25, False) +SMTP_SERVER = "" +SMTP_PORT = 0 +SMTP_OVER_SSL = False +# The timeout in seconds for the connection attempt, the default is 30. +SMTP_CONNECTION_TIMEOUT = 30 + + +############################## Monitor & Alert ################################ +# The default is False, set it to True to launch the poll subprocess to monitor your crawling jobs. +ENABLE_MONITOR = False + +########## poll interval ########## +# Tip: In order to be notified (and stop or forcestop a job when triggered) in time, +# you can reduce the value of POLL_ROUND_INTERVAL and POLL_REQUEST_INTERVAL, +# at the cost of burdening both CPU and bandwidth of your servers. + +# Sleep N seconds before starting next round of poll, the default is 300. +POLL_ROUND_INTERVAL = 300 +# Sleep N seconds between each request to the Scrapyd server while polling, the default is 10. +POLL_REQUEST_INTERVAL = 10 + +########## alert switcher ########## +# Tip: Set the SCRAPYDWEB_BIND option the in "QUICK SETUP" section to the actual IP of your host, +# then you can visit ScrapydWeb via the links attached in the alert. + +# The default is False, set it to True to enable alert via Slack, Telegram, or Email. +# You have to set up your accounts in the "Send text" section above first. +ENABLE_SLACK_ALERT = False +ENABLE_TELEGRAM_ALERT = False +ENABLE_EMAIL_ALERT = False + +########## alert working time ########## +# Monday is 1 and Sunday is 7. +# e.g, [1, 2, 3, 4, 5, 6, 7] +ALERT_WORKING_DAYS = [] + +# From 0 to 23. +# e.g. [9] + list(range(15, 18)) >>> [9, 15, 16, 17], or range(24) for 24 hours +ALERT_WORKING_HOURS = [] + +########## basic triggers ########## +# Trigger alert every N seconds for each running job. +# The default is 0, set it to a positive integer to enable this trigger. +ON_JOB_RUNNING_INTERVAL = 0 + +# Trigger alert when a job is finished. +# The default is False, set it to True to enable this trigger. +ON_JOB_FINISHED = False + +########## advanced triggers ########## +# - LOG_XXX_THRESHOLD: +# - Trigger alert the first time reaching the threshold for a specific kind of log. +# - The default is 0, set it to a positive integer to enable this trigger. +# - LOG_XXX_TRIGGER_STOP (optional): +# - The default is False, set it to True to stop current job automatically when reaching the LOG_XXX_THRESHOLD. +# - The SIGTERM signal would be sent only one time to shut down the crawler gracefully. +# - In order to avoid an UNCLEAN shutdown, the 'STOP' action would be executed one time at most +# - if none of the 'FORCESTOP' triggers is enabled, no matter how many 'STOP' triggers are enabled. +# - LOG_XXX_TRIGGER_FORCESTOP (optional): +# - The default is False, set it to True to FORCESTOP current job automatically when reaching the LOG_XXX_THRESHOLD. +# - The SIGTERM signal would be sent twice resulting in an UNCLEAN shutdown, without the Scrapy stats dumped! +# - The 'FORCESTOP' action would be executed if both of the 'STOP' and 'FORCESTOP' triggers are enabled. 
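# An illustrative combination (not the shipped defaults): alert the first time a job's log
# accumulates 50 ERROR lines and stop that job gracefully at the same point, e.g.
#     LOG_ERROR_THRESHOLD = 50
#     LOG_ERROR_TRIGGER_STOP = True
#     LOG_ERROR_TRIGGER_FORCESTOP = False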
+ +# Note that the 'STOP' action and the 'FORCESTOP' action would still be executed even when the current time +# is NOT within the ALERT_WORKING_DAYS and the ALERT_WORKING_HOURS, though no alert would be sent. + +LOG_CRITICAL_THRESHOLD = 0 +LOG_CRITICAL_TRIGGER_STOP = False +LOG_CRITICAL_TRIGGER_FORCESTOP = False + +LOG_ERROR_THRESHOLD = 0 +LOG_ERROR_TRIGGER_STOP = False +LOG_ERROR_TRIGGER_FORCESTOP = False + +LOG_WARNING_THRESHOLD = 0 +LOG_WARNING_TRIGGER_STOP = False +LOG_WARNING_TRIGGER_FORCESTOP = False + +LOG_REDIRECT_THRESHOLD = 0 +LOG_REDIRECT_TRIGGER_STOP = False +LOG_REDIRECT_TRIGGER_FORCESTOP = False + +LOG_RETRY_THRESHOLD = 0 +LOG_RETRY_TRIGGER_STOP = False +LOG_RETRY_TRIGGER_FORCESTOP = False + +LOG_IGNORE_THRESHOLD = 0 +LOG_IGNORE_TRIGGER_STOP = False +LOG_IGNORE_TRIGGER_FORCESTOP = False + + +############################## System ######################################### +# The default is False, set it to True to enable debug mode and the interactive debugger +# would be shown in the browser instead of the "500 Internal Server Error" page. +# Note that use_reloader is set to False in run.py +DEBUG = False + +# The default is False, set it to True to change the logging level from INFO to DEBUG +# for getting more information about how ScrapydWeb works, especially while debugging. +VERBOSE = False + +# The default is '', which means saving all program data in the Python directory. +# e.g. 'C:/Users/username/scrapydweb_data' or '/home/username/scrapydweb_data' +DATA_PATH = os.environ.get("DATA_PATH", "") + +# The default is '', which means saving data of Jobs and Timer Tasks in DATA_PATH using SQLite. +# The data could be also saved in MySQL or PostgreSQL backend in order to improve concurrency. +# To use MySQL backend, run command: pip install --upgrade pymysql +# To use PostgreSQL backend, run command: pip install --upgrade psycopg2 +# e.g. 
+# 'mysql://username:password@127.0.0.1:3306' +# 'postgres://username:password@127.0.0.1:5432' +# 'sqlite:///C:/Users/username' +# 'sqlite:////home/username' +DATABASE_URL = os.environ.get("DATABASE_URL", "") diff --git a/models.py b/models.py new file mode 100644 index 00000000..76a0564d --- /dev/null +++ b/models.py @@ -0,0 +1,179 @@ +# coding: utf-8 +from datetime import datetime +from pprint import pformat +import time + +from flask_sqlalchemy import SQLAlchemy + +from .vars import STATE_RUNNING + + +db = SQLAlchemy(session_options=dict(autocommit=False, autoflush=True)) + + +# TODO: Database Migrations https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-iv-database +# http://flask-sqlalchemy.pocoo.org/2.3/binds/#binds +class Metadata(db.Model): + __tablename__ = 'metadata' + __bind_key__ = 'metadata' + + id = db.Column(db.Integer, primary_key=True) + version = db.Column(db.String(20), unique=True, nullable=False) + last_check_update_timestamp = db.Column(db.Float, unique=False, default=time.time) + main_pid = db.Column(db.Integer, unique=False, nullable=True) + logparser_pid = db.Column(db.Integer, unique=False, nullable=True) + poll_pid = db.Column(db.Integer, unique=False, nullable=True) + pageview = db.Column(db.Integer, unique=False, nullable=False, default=0) + url_scrapydweb = db.Column(db.Text(), unique=False, nullable=False, default='http://127.0.0.1:5000') + url_jobs = db.Column(db.String(255), unique=False, nullable=False, default='/1/jobs/') + url_schedule_task = db.Column(db.String(255), unique=False, nullable=False, default='/1/schedule/task/') + url_delete_task_result = db.Column(db.String(255), unique=False, nullable=False, default='/1/tasks/xhr/delete/1/1/') + username = db.Column(db.String(255), unique=False, nullable=True) + password = db.Column(db.String(255), unique=False, nullable=True) + scheduler_state = db.Column(db.Integer, unique=False, nullable=False, default=STATE_RUNNING) + jobs_per_page = db.Column(db.Integer, unique=False, nullable=False, default=100) + tasks_per_page = db.Column(db.Integer, unique=False, nullable=False, default=100) + jobs_style = db.Column(db.String(8), unique=False, nullable=False, default='database') # 'classic' + + def __repr__(self): + return pformat(vars(self)) + + +# TODO: Timezone Conversions https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-xii-dates-and-times +def create_jobs_table(server): + class Job(db.Model): + __tablename__ = server + __bind_key__ = 'jobs' + # https://stackoverflow.com/questions/10059345/sqlalchemy-unique-across-multiple-columns + # https://stackoverflow.com/questions/43975349/why-uniqueconstraint-doesnt-work-in-flask-sqlalchemy + __table_args__ = (db.UniqueConstraint('project', 'spider', 'job'), ) + + id = db.Column(db.Integer, primary_key=True) + project = db.Column(db.String(255), unique=False, nullable=False) # Pending + spider = db.Column(db.String(255), unique=False, nullable=False) # Pending + job = db.Column(db.String(255), unique=False, nullable=False) # Pending + status = db.Column(db.String(1), unique=False, nullable=False, index=True) # Pending 0, Running 1, Finished 2 + deleted = db.Column(db.String(1), unique=False, nullable=False, default='0', index=True) + create_time = db.Column(db.DateTime, unique=False, nullable=False, default=datetime.now) + update_time = db.Column(db.DateTime, unique=False, nullable=False, default=datetime.now) + + pages = db.Column(db.Integer, unique=False, nullable=True) + items = db.Column(db.Integer, unique=False, nullable=True) + pid 
= db.Column(db.Integer, unique=False, nullable=True) # Running + start = db.Column(db.DateTime, unique=False, nullable=True, index=True) + runtime = db.Column(db.String(20), unique=False, nullable=True) + finish = db.Column(db.DateTime, unique=False, nullable=True, index=True) # Finished + href_log = db.Column(db.Text(), unique=False, nullable=True) + href_items = db.Column(db.Text(), unique=False, nullable=True) + + def __repr__(self): + return "" % ( + self.id, self.__tablename__, self.project, self.spider, self.job, self.start) + + return Job + # sqlalchemy/ext/declarative/clsregistry.py:128: SAWarning: This declarative base already contains a class + # with the same class name and module name as scrapydweb.models.Job, + # and will be replaced in the string-lookup table. + # https://stackoverflow.com/questions/27773489/dynamically-create-a-python-subclass-in-a-function + # return type('Job_%s' % server, (Job, ), dict(__tablename__=server, __bind_key__='jobs')) + +# print(dir([create_table(s) for s in 'abc'][0])) + + +# http://flask-sqlalchemy.pocoo.org/2.3/models/ One-to-Many Relationships +# https://techarena51.com/blog/one-to-many-relationships-with-flask-sqlalchemy/ +# https://docs.sqlalchemy.org/en/latest/orm/cascades.html#delete-orphan +# https://docs.sqlalchemy.org/en/latest/core/constraints.html#indexes +# https://stackoverflow.com/questions/14419299/adding-indexes-to-sqlalchemy-models-after-table-creation +# https://stackoverflow.com/questions/8890738/sqlalchemy-does-column-with-foreignkey-creates-index-automatically +class Task(db.Model): + __tablename__ = 'task' + + id = db.Column(db.Integer, primary_key=True) + name = db.Column(db.String(255), unique=False, nullable=True) # None + trigger = db.Column(db.String(8), unique=False, nullable=False) # cron, interval, date + create_time = db.Column(db.DateTime, unique=False, nullable=False, default=datetime.now) # datetime.utcnow + update_time = db.Column(db.DateTime, unique=False, nullable=False, default=datetime.now) + + project = db.Column(db.String(255), unique=False, nullable=False) + version = db.Column(db.String(255), unique=False, nullable=False) + spider = db.Column(db.String(255), unique=False, nullable=False) + jobid = db.Column(db.String(255), unique=False, nullable=False) + settings_arguments = db.Column(db.Text(), unique=False, nullable=False) + selected_nodes = db.Column(db.Text(), unique=False, nullable=False) + + year = db.Column(db.String(255), unique=False, nullable=False) + month = db.Column(db.String(255), unique=False, nullable=False) + day = db.Column(db.String(255), unique=False, nullable=False) + week = db.Column(db.String(255), unique=False, nullable=False) + day_of_week = db.Column(db.String(255), unique=False, nullable=False) + hour = db.Column(db.String(255), unique=False, nullable=False) + minute = db.Column(db.String(255), unique=False, nullable=False) + second = db.Column(db.String(255), unique=False, nullable=False) + + start_date = db.Column(db.String(19), unique=False, nullable=True) # '2019-01-01 00:00:01' None + end_date = db.Column(db.String(19), unique=False, nullable=True) # '2019-01-01 00:00:01' None + + timezone = db.Column(db.String(255), unique=False, nullable=True) # None + jitter = db.Column(db.Integer, unique=False, nullable=False) # int + misfire_grace_time = db.Column(db.Integer, unique=False, nullable=True) # None|a positive integer + coalesce = db.Column(db.String(5), unique=False, nullable=False) # 'True'|'False' + max_instances = db.Column(db.Integer, unique=False, nullable=False) 
# int + + results = db.relationship('TaskResult', backref='task', cascade='all, delete-orphan', lazy=True) + + def __repr__(self): + return "" % ( + self.id, self.name, self.project, self.version, self.spider, self.jobid, + self.create_time, self.update_time) + + +class TaskResult(db.Model): + __tablename__ = 'task_result' + + id = db.Column(db.Integer, primary_key=True) + task_id = db.Column(db.Integer, db.ForeignKey('task.id'), nullable=False, index=True) + execute_time = db.Column(db.DateTime, unique=False, nullable=False, default=datetime.now) + fail_count = db.Column(db.Integer, unique=False, nullable=False, default=0) + pass_count = db.Column(db.Integer, unique=False, nullable=False, default=0) + + results = db.relationship('TaskJobResult', backref='task_result', cascade='all, delete-orphan', lazy=True) + + def __repr__(self): + return "" % ( + self.id, self.task_id, self.task.name, self.fail_count, self.pass_count, self.execute_time) + + +class TaskJobResult(db.Model): + __tablename__ = 'task_job_result' + + id = db.Column(db.Integer, primary_key=True) + task_result_id = db.Column(db.Integer, db.ForeignKey('task_result.id'), nullable=False, index=True) + run_time = db.Column(db.DateTime, unique=False, nullable=False, default=datetime.now) + node = db.Column(db.Integer, unique=False, nullable=False, index=True) + server = db.Column(db.String(255), unique=False, nullable=False) # '127.0.0.1:6800' + status_code = db.Column(db.Integer, unique=False, nullable=False) # -1, 200 + status = db.Column(db.String(9), unique=False, nullable=False) # ok|error|exception + # psycopg2.DataError) value too long for type character varying(1000) + # https://docs.sqlalchemy.org/en/latest/core/type_basics.html#sqlalchemy.types.Text + # In general, TEXT objects do not have a length + result = db.Column(db.Text(), unique=False, nullable=False) # jobid|message|exception + + def __repr__(self): + kwargs = dict( + task_id=self.task_result.task_id, + task_name=self.task_result.task.name, + project=self.task_result.task.project, + version=self.task_result.task.version, + spider=self.task_result.task.spider, + jobid=self.task_result.task.jobid, + run_time=str(self.run_time), # TypeError: Object of type datetime is not JSON serializable + node=self.node, + server=self.server, + status_code=self.status_code, + status=self.status, + result=self.result, + task_result_id=self.task_result_id, + id=self.id, + ) + return '' % pformat(kwargs, indent=4) diff --git a/scrapydweb_settings_v10.py b/scrapydweb_settings_v10.py new file mode 100644 index 00000000..dc68d69a --- /dev/null +++ b/scrapydweb_settings_v10.py @@ -0,0 +1,361 @@ +# coding: utf-8 +""" +How ScrapydWeb works: +BROWSER <<<>>> SCRAPYDWEB_BIND:SCRAPYDWEB_PORT <<<>>> your SCRAPYD_SERVERS + +GitHub: https://github.com/my8100/scrapydweb +DOCS: https://github.com/my8100/files/blob/master/scrapydweb/README.md +文档:https://github.com/my8100/files/blob/master/scrapydweb/README_CN.md +""" +import os + + +############################## QUICK SETUP start ############################## +############################## 快速设置 开始 ################################### +# Setting SCRAPYDWEB_BIND to '0.0.0.0' or IP-OF-THE-CURRENT-HOST would make +# ScrapydWeb server visible externally; Otherwise, set it to '127.0.0.1'. +# The default is '0.0.0.0'. +SCRAPYDWEB_BIND = "0.0.0.0" +# Accept connections on the specified port, the default is 5000. +SCRAPYDWEB_PORT = 5000 + +# The default is False, set it to True to enable basic auth for the web UI. 
+ENABLE_AUTH = False +# In order to enable basic auth, both USERNAME and PASSWORD should be non-empty strings. +USERNAME = "" +PASSWORD = "" + + +# Make sure that [Scrapyd](https://github.com/scrapy/scrapyd) has been installed +# and started on all of your hosts. +# Note that for remote access, you have to manually set 'bind_address = 0.0.0.0' +# in the configuration file of Scrapyd and restart Scrapyd to make it visible externally. +# Check out 'https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file' for more info. +# ------------------------------ Chinese -------------------------------------- +# 请先确保所有主机都已经安装和启动 [Scrapyd](https://github.com/scrapy/scrapyd)。 +# 如需远程访问 Scrapyd,则需在 Scrapyd 配置文件中设置 'bind_address = 0.0.0.0',然后重启 Scrapyd。 +# 详见 https://scrapyd.readthedocs.io/en/latest/config.html#example-configuration-file + +# - the string format: username:password@ip:port#group +# - The default port would be 6800 if not provided, +# - Both basic auth and group are optional. +# - e.g. '127.0.0.1:6800' or 'username:password@localhost:6801#group' +# - the tuple format: (username, password, ip, port, group) +# - When the username, password, or group is too complicated (e.g. contains ':@#'), +# - or if ScrapydWeb fails to parse the string format passed in, +# - it's recommended to pass in a tuple of 5 elements. +# - e.g. ('', '', '127.0.0.1', '6800', '') or ('username', 'password', 'localhost', '6801', 'group') +SCRAPYD_SERVERS = [ + "127.0.0.1:6800", + # 'username:password@localhost:6801#group', + ("username", "password", "localhost", "6801", "group"), +] + + +# It's recommended to update the three options below +# if both ScrapydWeb and one of your Scrapyd servers run on the same machine. +# ------------------------------ Chinese -------------------------------------- +# 假如 ScrapydWeb 和某个 Scrapyd 运行于同一台主机,建议更新如下三个设置项。 + +# If both ScrapydWeb and one of your Scrapyd servers run on the same machine, +# ScrapydWeb would try to directly read Scrapy logfiles from disk, instead of making a request +# to the Scrapyd server. +# e.g. '127.0.0.1:6800' or 'localhost:6801', do not forget the port number. +LOCAL_SCRAPYD_SERVER = "" + +# Enter the directory when you run Scrapyd, run the command below +# to find out where the Scrapy logs are stored: +# python -c "from os.path import abspath, isdir; from scrapyd.config import Config; path = abspath(Config().get('logs_dir')); print(path); print(isdir(path))" +# Check out https://scrapyd.readthedocs.io/en/stable/config.html#logs-dir for more info. +# e.g. 'C:/Users/username/logs' or '/home/username/logs' +LOCAL_SCRAPYD_LOGS_DIR = "" + +# The default is False, set it to True to automatically run LogParser as a subprocess at startup. +# Note that you can run the LogParser service separately via command 'logparser' as you like. +# Run 'logparser -h' to find out the config file of LogParser for more advanced settings. +# Visit https://github.com/my8100/logparser for more info. +ENABLE_LOGPARSER = False +############################## QUICK SETUP end ################################ +############################## 快速设置 结束 ################################### + + +############################## ScrapydWeb ##################################### +# The default is False, set it to True and add both CERTIFICATE_FILEPATH and PRIVATEKEY_FILEPATH +# to run ScrapydWeb in HTTPS mode. 
+# Note that this feature is not fully tested, please leave your comment here if ScrapydWeb +# raises any excepion at startup: https://github.com/my8100/scrapydweb/issues/18 +ENABLE_HTTPS = False +# e.g. '/home/username/cert.pem' +CERTIFICATE_FILEPATH = "" +# e.g. '/home/username/cert.key' +PRIVATEKEY_FILEPATH = "" + + +############################## Scrapy ######################################### +# ScrapydWeb is able to locate projects in the SCRAPY_PROJECTS_DIR, +# so that you can simply select a project to deploy, instead of packaging it in advance. +# e.g. 'C:/Users/username/myprojects' or '/home/username/myprojects' +SCRAPY_PROJECTS_DIR = "" +# Set the maximum number of hours that a scraper can run for. +MAX_HOURS = 5 +# Set 0 to disable auto-stopping of long running scrapers +LONG_RUNNING_SCRAPER_STOP_INTERVAL = 10 + +############################## Scrapyd ######################################## +# ScrapydWeb would try every extension in sequence to locate the Scrapy logfile. +# The default is ['.log', '.log.gz', '.txt']. +SCRAPYD_LOG_EXTENSIONS = [".log", ".log.gz", ".txt"] + + +############################## LogParser ###################################### +# Whether to backup the stats json files locally after you visit the Stats page of a job +# so that it is still accessible even if the original logfile has been deleted. +# The default is True, set it to False to disable this behaviour. +BACKUP_STATS_JSON_FILE = True + + +############################## Timer Tasks #################################### +# Run ScrapydWeb with argument '-sw' or '--switch_scheduler_state', or click the ENABLED|DISABLED button +# on the Timer Tasks page to turn on/off the scheduler for the timer tasks and the snapshot mechanism below. + +# The default is 300, which means ScrapydWeb would automatically create a snapshot of the Jobs page +# and save the jobs info in the database in the background every 300 seconds. +# Note that this behaviour would be paused if the scheduler for timer tasks is disabled. +# Set it to 0 to disable this behaviour. +JOBS_SNAPSHOT_INTERVAL = 300 + + +############################## Run Spider ##################################### +# The default is False, set it to True to automatically +# expand the 'settings & arguments' section in the Run Spider page. +SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = False + +# The default is 'Mozilla/5.0', set it a non-empty string to customize the default value of `custom` +# in the drop-down list of `USER_AGENT`. +SCHEDULE_CUSTOM_USER_AGENT = "Mozilla/5.0" + +# The default is None, set it to any value of ['custom', 'Chrome', 'iPhone', 'iPad', 'Android'] +# to customize the default value of `USER_AGENT`. +SCHEDULE_USER_AGENT = None + +# The default is None, set it to True or False to customize the default value of `ROBOTSTXT_OBEY`. +SCHEDULE_ROBOTSTXT_OBEY = None + +# The default is None, set it to True or False to customize the default value of `COOKIES_ENABLED`. +SCHEDULE_COOKIES_ENABLED = None + +# The default is None, set it to a non-negative integer to customize the default value of `CONCURRENT_REQUESTS`. +SCHEDULE_CONCURRENT_REQUESTS = None + +# The default is None, set it to a non-negative number to customize the default value of `DOWNLOAD_DELAY`. +SCHEDULE_DOWNLOAD_DELAY = None + +# The default is "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1", +# set it to '' or any non-empty string to customize the default value of `additional`. +# Use '\r\n' as the line separator. 
+SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1" + + +############################## Page Display ################################### +# The default is True, set it to False to hide the Items page, as well as +# the Items column in the Jobs page. +SHOW_SCRAPYD_ITEMS = True + +# The default is True, set it to False to hide the Job column in the Jobs page with non-database view. +SHOW_JOBS_JOB_COLUMN = True + +# The default is 0, which means unlimited, set it to a positive integer so that +# only the latest N finished jobs would be shown in the Jobs page with non-database view. +JOBS_FINISHED_JOBS_LIMIT = 0 + +# If your browser stays on the Jobs page, it would be reloaded automatically every N seconds. +# The default is 300, set it to 0 to disable auto-reloading. +JOBS_RELOAD_INTERVAL = 300 + +# The load status of the current Scrapyd server is checked every N seconds, +# which is displayed in the top right corner of the page. +# The default is 10, set it to 0 to disable auto-refreshing. +DAEMONSTATUS_REFRESH_INTERVAL = 10 + + +############################## Send Text ###################################### +########## usage in scrapy projects ########## +# See the "Send Text" page + +########## slack ########## +# How to create a slack app: +# 1. Visit https://api.slack.com/apps and press the "Create New App" button. +# 2. Enter your App Name (e.g. myapp)and select one of your Slack Workspaces, the press "Create App". +# 3. Click the "OAuth & Permissions" menu in the sidebar on the left side of the page. +# 4. Scroll down the page and find out "Select Permission Scopes" in the "Scopes" section +# 5. Enter "send" and select "Send messages as ", then press "Save Changes" +# 6. Scroll up the page and press "Install App to Workspace", then press "Install" +# 7. Copy the "OAuth Access Token", e.g. xoxp-123-456-789-abcde +# See https://api.slack.com/apps for more info + +# See step 1~7 above, e.g. 'xoxp-123-456-789-abcde' +SLACK_TOKEN = os.environ.get("SLACK_TOKEN", "") +# The default channel to use when sending text via slack, e.g. 'general' +SLACK_CHANNEL = "general" + +########## telegram ########## +# How to create a telegram bot: +# 1. Visit https://telegram.me/botfather to start a conversation with Telegram's bot that creates other bots. +# 2. Send the /newbot command to create a new bot in a chat with BotFather. +# 3. Follow the instructions to set up name and username (e.g. my_bot) for your bot. +# 4. You would get a token (e.g. 123:abcde) after step 3. +# 5. Visit telegram.me/ (e.g. telegram.me/my_bot) and say hi to your bot to initiate a conversation. +# 6. Visit https://api.telegram.org/bot/getUpdates to get the chat_id. +# (e.g. Visit https://api.telegram.org/bot123:abcde/getUpdates +# and you can find the chat_id in "chat":{"id":123456789,...) +# See https://core.telegram.org/bots#6-botfather for more info + +# See step 1~4 above, e.g. '123:abcde' +TELEGRAM_TOKEN = os.environ.get("TELEGRAM_TOKEN", "") +# See step 5~6 above, e.g. 123456789 +TELEGRAM_CHAT_ID = int(os.environ.get("TELEGRAM_CHAT_ID", 0)) + +########## email ########## +# The default subject to use when sending text via email. +EMAIL_SUBJECT = "Email from #scrapydweb" + +########## email sender & recipients ########## +# Leave this option as '' to default to the EMAIL_SENDER option below; Otherwise, set it up +# if your email service provider requires an username which is different from the EMAIL_SENDER option below to login. +# e.g. 
'username' +EMAIL_USERNAME = "" +# As for different email service provider, you might have to get an APP password (like Gmail) +# or an authorization code (like QQ mail) and set it as the EMAIL_PASSWORD. +# Check out links below to get more help: +# https://stackoverflow.com/a/27515833/10517783 How to send an email with Gmail as the provider using Python? +# https://stackoverflow.com/a/26053352/10517783 Python smtplib proxy support +# e.g. 'password4gmail' +EMAIL_PASSWORD = os.environ.get("EMAIL_PASSWORD", "") + +# e.g. 'username@gmail.com' +EMAIL_SENDER = "" +# e.g. ['username@gmail.com', ] +EMAIL_RECIPIENTS = [EMAIL_SENDER] + +########## email smtp settings ########## +# Check out this link if you are using ECS of Alibaba Cloud and your SMTP server provides TCP port 25 only: +# https://www.alibabacloud.com/help/doc-detail/56130.htm +# Config for https://mail.google.com using SSL: ('smtp.gmail.com', 465, True) +# Config for https://mail.google.com: ('smtp.gmail.com', 587, False) +# Config for https://mail.qq.com using SSL: ('smtp.qq.com', 465, True) +# Config for http://mail.10086.cn: ('smtp.139.com', 25, False) +SMTP_SERVER = "" +SMTP_PORT = 0 +SMTP_OVER_SSL = False +# The timeout in seconds for the connection attempt, the default is 30. +SMTP_CONNECTION_TIMEOUT = 30 + + +############################## Monitor & Alert ################################ +# The default is False, set it to True to launch the poll subprocess to monitor your crawling jobs. +ENABLE_MONITOR = False + +########## poll interval ########## +# Tip: In order to be notified (and stop or forcestop a job when triggered) in time, +# you can reduce the value of POLL_ROUND_INTERVAL and POLL_REQUEST_INTERVAL, +# at the cost of burdening both CPU and bandwidth of your servers. + +# Sleep N seconds before starting next round of poll, the default is 300. +POLL_ROUND_INTERVAL = 300 +# Sleep N seconds between each request to the Scrapyd server while polling, the default is 10. +POLL_REQUEST_INTERVAL = 10 + +########## alert switcher ########## +# Tip: Set the SCRAPYDWEB_BIND option the in "QUICK SETUP" section to the actual IP of your host, +# then you can visit ScrapydWeb via the links attached in the alert. + +# The default is False, set it to True to enable alert via Slack, Telegram, or Email. +# You have to set up your accounts in the "Send text" section above first. +ENABLE_SLACK_ALERT = False +ENABLE_TELEGRAM_ALERT = False +ENABLE_EMAIL_ALERT = False + +########## alert working time ########## +# Monday is 1 and Sunday is 7. +# e.g, [1, 2, 3, 4, 5, 6, 7] +ALERT_WORKING_DAYS = [] + +# From 0 to 23. +# e.g. [9] + list(range(15, 18)) >>> [9, 15, 16, 17], or range(24) for 24 hours +ALERT_WORKING_HOURS = [] + +########## basic triggers ########## +# Trigger alert every N seconds for each running job. +# The default is 0, set it to a positive integer to enable this trigger. +ON_JOB_RUNNING_INTERVAL = 0 + +# Trigger alert when a job is finished. +# The default is False, set it to True to enable this trigger. +ON_JOB_FINISHED = False + +########## advanced triggers ########## +# - LOG_XXX_THRESHOLD: +# - Trigger alert the first time reaching the threshold for a specific kind of log. +# - The default is 0, set it to a positive integer to enable this trigger. +# - LOG_XXX_TRIGGER_STOP (optional): +# - The default is False, set it to True to stop current job automatically when reaching the LOG_XXX_THRESHOLD. +# - The SIGTERM signal would be sent only one time to shut down the crawler gracefully. 
+# - In order to avoid an UNCLEAN shutdown, the 'STOP' action would be executed one time at most +# - if none of the 'FORCESTOP' triggers is enabled, no matter how many 'STOP' triggers are enabled. +# - LOG_XXX_TRIGGER_FORCESTOP (optional): +# - The default is False, set it to True to FORCESTOP current job automatically when reaching the LOG_XXX_THRESHOLD. +# - The SIGTERM signal would be sent twice resulting in an UNCLEAN shutdown, without the Scrapy stats dumped! +# - The 'FORCESTOP' action would be executed if both of the 'STOP' and 'FORCESTOP' triggers are enabled. + +# Note that the 'STOP' action and the 'FORCESTOP' action would still be executed even when the current time +# is NOT within the ALERT_WORKING_DAYS and the ALERT_WORKING_HOURS, though no alert would be sent. + +LOG_CRITICAL_THRESHOLD = 0 +LOG_CRITICAL_TRIGGER_STOP = False +LOG_CRITICAL_TRIGGER_FORCESTOP = False + +LOG_ERROR_THRESHOLD = 0 +LOG_ERROR_TRIGGER_STOP = False +LOG_ERROR_TRIGGER_FORCESTOP = False + +LOG_WARNING_THRESHOLD = 0 +LOG_WARNING_TRIGGER_STOP = False +LOG_WARNING_TRIGGER_FORCESTOP = False + +LOG_REDIRECT_THRESHOLD = 0 +LOG_REDIRECT_TRIGGER_STOP = False +LOG_REDIRECT_TRIGGER_FORCESTOP = False + +LOG_RETRY_THRESHOLD = 0 +LOG_RETRY_TRIGGER_STOP = False +LOG_RETRY_TRIGGER_FORCESTOP = False + +LOG_IGNORE_THRESHOLD = 0 +LOG_IGNORE_TRIGGER_STOP = False +LOG_IGNORE_TRIGGER_FORCESTOP = False + + +############################## System ######################################### +# The default is False, set it to True to enable debug mode and the interactive debugger +# would be shown in the browser instead of the "500 Internal Server Error" page. +# Note that use_reloader is set to False in run.py +DEBUG = False + +# The default is False, set it to True to change the logging level from INFO to DEBUG +# for getting more information about how ScrapydWeb works, especially while debugging. +VERBOSE = False + +# The default is '', which means saving all program data in the Python directory. +# e.g. 'C:/Users/username/scrapydweb_data' or '/home/username/scrapydweb_data' +DATA_PATH = os.environ.get("DATA_PATH", "") + +# The default is '', which means saving data of Jobs and Timer Tasks in DATA_PATH using SQLite. +# The data could be also saved in MySQL or PostgreSQL backend in order to improve concurrency. +# To use MySQL backend, run command: pip install --upgrade pymysql +# To use PostgreSQL backend, run command: pip install --upgrade psycopg2 +# e.g. +# 'mysql://username:password@127.0.0.1:3306' +# 'postgres://username:password@127.0.0.1:5432' +# 'sqlite:///C:/Users/username' +# 'sqlite:////home/username' +DATABASE_URL = os.environ.get("DATABASE_URL", "") diff --git a/static/v140/css/dropdown.css b/static/v140/css/dropdown.css new file mode 100644 index 00000000..b1a99fe9 --- /dev/null +++ b/static/v140/css/dropdown.css @@ -0,0 +1,141 @@ +/* dropdown start */ +/* https://www.w3schools.com/css/css_dropdowns.asp */ +/* The container
- needed to position the dropdown content */ +.dropdown { + position: relative; + margin-left: 15px; + /* width: 220px; */ + /* width: -moz-fit-content; */ + /* width: fit-content; */ + min-width: 220px; + width: max-content; + width: -moz-max-content; /* Firefox on Ubuntu */ + height: 32px; + align-self: center; + border: 1px solid #67c23a; + border-radius: 2px; +} + +.dropdown:hover{ + /* background: #9DC8C8; */ + /* border: 1px solid #03a87c; */ +} + +.dropdown:hover .icon.anchor { + transform: rotate(-180deg); +} + +/* Style The Dropdown Button */ +.dropbtn { + color: #67c23a; + line-height: 32px; + padding: 0 16px; + font-size: 16px; + + cursor: pointer; + width: 100%; + display: flex; + box-sizing: border-box; + align-items: center; +} + +.dropbtn>span { + flex-grow: 1; +} + +.dropbtn>.ip{ + width: 1.5rem; + height: 1.5rem; + padding-right: .5rem; +} + +.dropbtn>.icon.anchor{ + width: 1rem; + height: 1rem; + transition: all .3s ease-out; +} + +.dropdown-content-wrap { + display: none; + /* width: 220px; */ + min-width: 220px; + width: max-content; + width: -moz-max-content; /* Firefox on Ubuntu */ + max-height: 70vh; /* 360px */ + overflow: hidden; + z-index: 9999; + position: absolute; + top: 100%; + transform: translateY(1px); + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + background-color: #fff; +} + +/* Dropdown Content (Hidden by Default) */ +.dropdown-content { + min-width: 160px; + overflow-y: hidden; + flex-grow: 1; +} + +.dropdown-content:hover { + overflow-y: scroll; +} + +/* https://www.w3schools.com/howto/howto_css_custom_scrollbar.asp */ +/* Width */ +.dropdown-content::-webkit-scrollbar { + width: 6px; +} + +/* Track */ +.dropdown-content::-webkit-scrollbar-track { + border-radius: 6px; +} + +/* Handle */ +.dropdown-content::-webkit-scrollbar-thumb { + background: #9093994d; + border-radius: 6px; +} + +/* Handle on hover */ +.dropdown-content::-webkit-scrollbar-thumb:hover { + background: #90939980; +} + + +/* Links inside the dropdown */ +.dropdown-content a { + color: #777; + padding: 4px 16px; + text-decoration: none; + display: block; + line-height: 32px; + text-align: left; +} + +.dropdown-content div { + color: red; + line-height: 16px; + padding: 4px 16px; +} + +/* Change color of dropdown links on hover */ +.dropdown-content a:hover { + background-color: #f5f7fa; + color: #333; + font-weight: 500; +} + +/* Show the dropdown menu on hover */ +.dropdown:hover .dropdown-content-wrap { + display: flex; +} + +/* Change the background color of the dropdown button when the dropdown content is shown */ +.dropdown:hover .dropbtn { + /* color: #fff; */ +} + +/* dropdown END */ diff --git a/static/v140/css/dropdown_mobileui.css b/static/v140/css/dropdown_mobileui.css new file mode 100644 index 00000000..e615cb02 --- /dev/null +++ b/static/v140/css/dropdown_mobileui.css @@ -0,0 +1,114 @@ +/* dropdown START */ +/* https://www.w3schools.com/howto/howto_js_mobile_navbar.asp */ + +/* https://www.w3schools.com/css/css_dropdowns.asp */ +/* The container
- needed to position the dropdown content */ +.dropdown { + position: relative; + margin: 0px 12px; + min-width: 220px; + width: max-content; + width: -moz-max-content; /* Firefox on Ubuntu */ + height: 32px; + align-self: center; + border: 1px solid #67c23a; + border-radius: 2px; +} + +/* Style The Dropdown Button */ +.dropbtn { + color: #67c23a; + line-height: 32px; + padding: 0 16px; + font-size: 16px; + + cursor: pointer; + width: 100%; + display: flex; + box-sizing: border-box; + align-items: center; +} + +.dropbtn>span { + flex-grow: 1; + +} + +.dropbtn>.ip{ + width: 1.5rem; + height: 1.5rem; + padding-right: .5rem; +} + +.dropbtn>.icon.anchor{ + width: 1rem; + height: 1rem; + transition: all .3s ease-out; +} + +.dropdown-content-wrap { + display: none; + min-width: 220px; + width: max-content; + width: -moz-max-content; /* Firefox on Ubuntu */ + max-height: 70vh; + overflow: hidden; + z-index: 9999; + position: absolute; + top: 100%; + transform: translateY(1px); + box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2); + background-color: #fff; +} + +/* Dropdown Content (Hidden by Default) */ +.dropdown-content { + min-width: 160px; + overflow-y: hidden; + flex-grow: 1; +} + +.dropdown-content { + overflow-y: scroll; +} + +/* https://www.w3schools.com/howto/howto_css_custom_scrollbar.asp START */ +/* Width */ +.dropdown-content::-webkit-scrollbar { + width: 6px; +} + +/* Track */ +.dropdown-content::-webkit-scrollbar-track { + border-radius: 6px; +} + +/* Handle */ +.dropdown-content::-webkit-scrollbar-thumb { + background: #9093994d; + border-radius: 6px; +} + +/* Handle on hover */ +.dropdown-content::-webkit-scrollbar-thumb:hover { + background: #90939980; +} + +/* Links inside the dropdown */ +.dropdown-content a { + color: #777; + padding: 4px 16px; + text-decoration: none; + display: block; + line-height: 32px; + text-align: left; +} +/* https://www.w3schools.com/howto/howto_css_custom_scrollbar.asp END */ + +.dropdown-content div { + color: red; + line-height: 16px; + padding: 4px 16px; +} + +/* dropdown END */ diff --git a/static/v140/css/icon_upload_icon_right.css b/static/v140/css/icon_upload_icon_right.css new file mode 100644 index 00000000..4c9a5d37 --- /dev/null +++ b/static/v140/css/icon_upload_icon_right.css @@ -0,0 +1,23 @@ + +@font-face {font-family: "iconfont"; + src: url('//at.alicdn.com/t/font_804951_pwq5pv3hrpi.eot?t=1535375413078'); /* IE9*/ + src: url('//at.alicdn.com/t/font_804951_pwq5pv3hrpi.eot?t=1535375413078#iefix') format('embedded-opentype'), /* IE6-IE8 */ + 
url('data:application/x-font-woff;charset=utf-8;base64,d09GRgABAAAAAAWUAAsAAAAACBgAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAABHU1VCAAABCAAAADMAAABCsP6z7U9TLzIAAAE8AAAARAAAAFY8mkgjY21hcAAAAYAAAABbAAABlM2RNUhnbHlmAAAB3AAAAbUAAAHkzMRI92hlYWQAAAOUAAAALwAAADYSc3ZqaGhlYQAAA8QAAAAcAAAAJAfeA4VobXR4AAAD4AAAAA4AAAAQEAAAAGxvY2EAAAPwAAAACgAAAAoBjgDEbWF4cAAAA/wAAAAeAAAAIAEQAHduYW1lAAAEHAAAAUUAAAJtPlT+fXBvc3QAAAVkAAAALgAAAD8guUVKeJxjYGRgYOBikGPQYWB0cfMJYeBgYGGAAJAMY05meiJQDMoDyrGAaQ4gZoOIAgCKIwNPAHicY2BkYWCcwMDKwMHUyXSGgYGhH0IzvmYwYuRgYGBiYGVmwAoC0lxTGByeqTzTY27438AQw9zM0AAUZgTJAQDjpAw4eJztkLENgDAMBM8kpECMQUlSMQ0V02eN8LEZg5fOL7/l5oEVSOIUGezBmLqVmueJzfNM017kxtKP3sYAeZ0umW5FvsSnFX7tPq9vS7OvQC3Ra+B5C7AXNbYQzgB4nDWRz4rTUBTG73dj7r2Z3Cm5JU1SkXRMJK3tGKFNE/pHZ2yLDmqdakdwlsowFRF0IehScO/KjeATzAMIXehG0FcSF6l3BA+Hc74D3/ktziGUkM0fY2PskIfkBSGogMdRkuUj3ESRosmjpMj7WUrjiPEKmok21NxzyVkthN/rei7rQDuzIi/0Ttc/b71uSH0ewmTRdSiXRUlTcV2zPeTdBjzX2NDVfG1Jaa3HC2AxwpYEjfeX6cGb+gVPhI+HEDxoOzs4npxZtm2dHb2iV+M+Oxx2roRF/KCdP7vfAm4fl++Yw6dS2ZgyZ5udcoedMhzNV1QqSQ/HowVFxXK200e3ouVkyxQhhh2T25Rfctv55Mk/2+vlrH/NHdxDcKMV1Fvzp7m0Tp4zNi0/CykFVlMmlebqVEQH9N3e09/GB1IlxAJzfXjdPGsiyQrkKejPcuYH+C48R5QTwfHNFg3jbTmu79fLO0LVBNbCs/DDbvD/vC8GMTrEJ8SMklxTNC3Vb/ARouZy/RoMgstmFS+VKj8qM9rV8mKjV36yvJNYD3dVpDCo7samU/7y9uJZr/xqMvIXuVxPmQAAAHicY2BkYGAAYrPsjY7x/DZfGbhZGEDg+ipJUwT9v4GFgbkZyOVgYAKJAgACIQj1AHicY2BkYGBu+N/AEMPCAAJAkpEBFbAAAEcKAm14nGNhYGBgQcIAALAAEQAAAAAAAACcAMQA8gAAeJxjYGRgYGBhyGZgYgABEMkFhAwM/8F8BgATVwGIAAB4nGWPTU7DMBCFX/oHpBKqqGCH5AViASj9EatuWFRq911036ZOmyqJI8et1ANwHo7ACTgC3IA78EgnmzaWx9+8eWNPANzgBx6O3y33kT1cMjtyDRe4F65TfxBukF+Em2jjVbhF/U3YxzOmwm10YXmD17hi9oR3YQ8dfAjXcI1P4Tr1L+EG+Vu4iTv8CrfQ8erCPuZeV7iNRy/2x1YvnF6p5UHFockikzm/gple75KFrdLqnGtbxCZTg6BfSVOdaVvdU+zXQ+ciFVmTqgmrOkmMyq3Z6tAFG+fyUa8XiR6EJuVYY/62xgKOcQWFJQ6MMUIYZIjK6Og7VWb0r7FDwl57Vj3N53RbFNT/c4UBAvTPXFO6stJ5Ok+BPV8bUnV0K27LnpQ0kV7NSRKyQl7WtlRC6gE2ZVeOEXpc0Yk/KGdI/wAJWm7IAAAAeJxjYGKAAC4G7ICFkYmRmZGFkZWBrbQgJz8xhSUpMTmbtSgzPaOEgQEASqgGZgAA') format('woff'), + url('//at.alicdn.com/t/font_804951_pwq5pv3hrpi.ttf?t=1535375413078') format('truetype'), /* chrome, firefox, opera, Safari, Android, iOS 4.2+*/ + url('//at.alicdn.com/t/font_804951_pwq5pv3hrpi.svg?t=1535375413078#iconfont') format('svg'); /* iOS 4.1- */ +} + +.iconfont { + font-family:"iconfont" !important; + font-size:16px; + font-style:normal; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; +} + +.icon-upload:before { content: "\e62e"; } + +.icon-back:before { content: "\e624"; } + +.icon-right:before { content: "\e62d"; } + diff --git a/static/v140/css/multinode.css b/static/v140/css/multinode.css new file mode 100644 index 00000000..97ed513c --- /dev/null +++ b/static/v140/css/multinode.css @@ -0,0 +1,7 @@ +input[type=checkbox] {zoom: 150%} +table { + margin-top: 10px; + margin-bottom: 30px; +} +.action {margin: 20px 0;} +.state.normal, .state.safe {display: table-cell;} \ No newline at end of file diff --git a/static/v140/css/stacktable.css b/static/v140/css/stacktable.css new file mode 100644 index 00000000..baefb9c3 --- /dev/null +++ b/static/v140/css/stacktable.css @@ -0,0 +1,18 @@ +.stacktable { width: 100%; } +.st-head-row { padding-top: 1em; } +.st-head-row.st-head-row-main { font-size: 1.5em; padding-top: 0; } +.st-key { width: 49%; text-align: right; padding-right: 1%; } +.st-val { width: 49%; padding-left: 1%; } + + + +/* RESPONSIVE EXAMPLE */ + +.stacktable.large-only { display: table; } +.stacktable.small-only { display: none; } + +/* iPhone 6S 750 * 1334 */ +@media (max-width: 639px) { + 
.stacktable.large-only { display: none; } + .stacktable.small-only { display: table; } +} \ No newline at end of file diff --git a/static/v140/css/stats.css b/static/v140/css/stats.css new file mode 100644 index 00000000..2f3443a8 --- /dev/null +++ b/static/v140/css/stats.css @@ -0,0 +1,50 @@ +table .blue {color: #409EFF;} +table .green {color: #67c23a;} +table .orange {color: orange;} +table .red {color: red;} +table .blue, .green, .orange, .red {font-size: 20px;} + +#content pre { + white-space: pre-wrap; + /* word-wrap: break-word; */ +} + +#content .tab-content .table { + border: 1px solid #d8d8d8; +} + +#content a.icon { + /* display: flex; */ + width: 50px; + height: 30px; + font-size: 20px; + color: #707070; + /* justify-content: center; */ + /* align-items: center; */ +} + +#content .icon-back { + width: 100%; + height: 100%; +} + +#content .icon-back:before { + display: inline-block; + width: 60px; + height: 30px; + font-size: 20px; + line-height: 30px; + text-align: center; + border-radius: 3px; + background: #409EFF; + color: #fff; + opacity: 0.7; + transition: .1s all ease-in-out; +} + +#content .icon-back:hover:before { + color: #409EFF; + border-color: #409EFF; + opacity: 1; + color: #fff; +} diff --git a/static/v140/css/style.css b/static/v140/css/style.css new file mode 100644 index 00000000..16f76280 --- /dev/null +++ b/static/v140/css/style.css @@ -0,0 +1,938 @@ +body, +div, +ul, +ol, +li, +p, +input { + margin: 0; + padding: 0; +} + +html { + height: 100%; +} + +body { + min-width: 600px; + height: 100%; + display: flex; + flex-direction: column; + background: rgba(0,0,0,0.05); + font-size: 16px; + font-family: Cambria, Georgia, Times, "Times New Roman", serif; + color: #555; +} + +.icon { + width: 1.25em; + height: 1.25em; + vertical-align: -0.175em; + fill: currentColor; + overflow: hidden; +} + + +em.normal {color: #feb324;} +em.pass {color: #67c23a;} +em.fail {color: red;} + +em.normal, +em.pass, +em.fail { + font-size: large; + font-weight: bold; +} + +.forbid {cursor: not-allowed;} + +pre { + white-space: pre-wrap; + background-color: #e3e3e3; +} + +a { + cursor: pointer; + text-decoration: none; + color: #409EFF; +} + +aside a {color: #909090;} +aside a:hover {color: #3d3d3d;} + + +a.link {color: #2980B9;} +a.link:hover {color: #409EFF;} +a.link:visited { + /* color: #9B59B6; */ +} + +a.request {text-decoration: underline;} + + +ul, +ol { + list-style: none; +} + +/* ul#links{ */ + /* position: absolute; */ + /* bottom: 0; */ + /* margin-top: 30px; */ + /* margin-bottom: 30px; */ +/* } */ +/* @media (max-height: 800px) { */ + /* ul#links { position: relative; } */ +/* } */ + +.github { + margin: 16px 0 16px 20px; +} + +.clear-float { + zoom: 1; +} + +/* ==for IE6/7 Maxthon2== */ +.clear-float:after { + clear: both; + content: ''; + display: block; + width: 0; + height: 0; + visibility: hidden; +} + + +/* ==for FF/chrome/opera/IE8== */ +nav { + height: 50px; + min-height: 50px; + line-height: 50px; + box-shadow: 0 0 18px 0 #d4dee6; + color: #909090; + padding-right: 60px; + display: flex; + + background: #fff; + border-bottom: 1px solid #e3e3e3; +} + +nav>.title { + display: inline-block; + font-size: 20px; + width: 160px; + min-width: 160px; + color: #fff; + background: #feb324; + text-align: center; + position: relative; + + height: 51px; +} + +nav>.title:after { + content: ''; + position: absolute; + display: block; + width: 0; + height: 0; + border-left: 80px solid transparent; + border-right: 80px solid transparent; + border-top: 10px solid #feb324; + 
border-bottom: 10px solid transparent; + top: 100%; + left: 0; +} + +nav>.title>.version { + font-size: 14px; + color: #aaa; + position: absolute; + left: 0; + right: 0; + top: 100%; + z-index: 1; +} + +nav>.switch-node { + text-align: center; + min-width: 100px; + z-index: 2; +} +nav>.switch-node>a { + color: #909090; + line-height: 50px; + font-size: 30px; + padding: 0 10px; +} +nav>.switch-node>a:hover {color: #409EFF;} +nav>.switch-node span {font-size: 18px;} + + +nav>#nav_daemonstatus{ + flex-grow: 1; + text-align: right; +} + +nav>#nav_daemonstatus>li{ + display: inline-block; +} + +/* https://www.w3schools.com/howto/howto_css_arrows.asp */ +nav i { + border: solid black; + border-width: 0 3px 3px 0; + display: inline-block; + padding: 3px; + font-size: 16px; + width: 4px; + height: 4px; +} + +#nav_daemonstatus span { + margin-left: 4px; + margin-right: 24px; +} + + +main { + display: flex; + flex-grow: 1; + /* Fix the bug that
diff --git a/templates/scrapydweb/logs_items.html b/templates/scrapydweb/logs_items.html new file mode 100644 index 00000000..4645e20c --- /dev/null +++ b/templates/scrapydweb/logs_items.html @@ -0,0 +1,135 @@ +{% extends 'base.html' %} + +{% block title %}{{ title }}{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} +

+ + Directory listing for /{{ title }}/{% if project %}{{ project }}/{% endif %}{% if spider %}{{ spider }}/{% endif %} + +

+ +
+ +
+ + + +{% endblock %} diff --git a/templates/scrapydweb/multinode_results.html b/templates/scrapydweb/multinode_results.html new file mode 100644 index 00000000..f2f89860 --- /dev/null +++ b/templates/scrapydweb/multinode_results.html @@ -0,0 +1,146 @@ +{% extends 'base.html' %} + +{% block title %}multinode results{% endblock %} + +{% block head %} + + +{% endblock %} + + +{% block body %} +

{{ title }}

+ + +
+ + + + + + + {% if opt == 'stop' %} + + {% else %} + + {% endif %} + + + + + + {% if opt == 'stop' %} + + + {% elif opt == 'delversion' %} + + {% endif %} + + + + + + {% for SCRAPYD_SERVER in SCRAPYD_SERVERS %} + {% if loop.index in selected_nodes %} + + + + + + + + + {% if opt == 'stop' %} + + + {% elif opt == 'delversion' %} + + {% endif %} + + {% endif %} + {% endfor %} + + +
+ + invert selection + IndexJobsProjectsNode nameStatusProjectJobPrev stateVersion
{{ loop.index }} + 1 %}"_blank"{% else %}"_self"{% endif %} + >{{ SCRAPYD_SERVERS[loop.index-1] }} +
+ + +
+ + + +{% endblock %} diff --git a/templates/scrapydweb/node_reports.html b/templates/scrapydweb/node_reports.html new file mode 100644 index 00000000..de2f6f8d --- /dev/null +++ b/templates/scrapydweb/node_reports.html @@ -0,0 +1,267 @@ +{% extends 'base.html' %} + +{% block title %}servers{% endblock %} + +{% block head %} + +{% endblock %} + +{% block body %} +

+ Get the reports of running and finished jobs of all projects since the Scrapyd server started. + 

+ + + +
+ +
+ + + +{% endblock %} diff --git a/templates/scrapydweb/parse.html b/templates/scrapydweb/parse.html new file mode 100644 index 00000000..38f1ff65 --- /dev/null +++ b/templates/scrapydweb/parse.html @@ -0,0 +1,57 @@ +{% extends 'base.html' %} + +{% block title %}log parser{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} +

Upload a Scrapy logfile to parse.

+
+
+
+ + + a log or txt file + + + + + demo +
+
+
+ + + +{% endblock %} diff --git a/templates/scrapydweb/projects.html b/templates/scrapydweb/projects.html new file mode 100644 index 00000000..a7847eda --- /dev/null +++ b/templates/scrapydweb/projects.html @@ -0,0 +1,80 @@ +{% extends 'base.html' %} + +{% block title %}projects{% endblock %} + +{% block head %} + +{% endblock %} + +{% block body %} +

+ Get the list of projects uploaded to this Scrapyd server. + 

+ +
+

{{ node_name }}

+
    + {% if not results %} +
  • No projects found. Go to Deploy Project
  • + {% endif %} + + {% for project, url_listversions in results %} +
  • +
    +

    {{ project }}

    + +
    +
    +
  • + {% endfor %} +
+
+ + + + + +{% endblock %} diff --git a/templates/scrapydweb/schedule.html b/templates/scrapydweb/schedule.html new file mode 100644 index 00000000..0bb1fc48 --- /dev/null +++ b/templates/scrapydweb/schedule.html @@ -0,0 +1,919 @@ +{% extends 'base.html' %} + +{% block title %}run spider{% endblock %} + +{% block head %} + + + + + + + {% if SCRAPYD_SERVERS_AMOUNT > 1 %} + + {% endif %} +{% endblock %} + + +{% block body %} + + +
+
    +
  • +
    +

    HELP

    + +
    + +
      + Run Spider

      +
    • project (string, required) - the project name
    • +
    • _version (string, optional) - the version of the project to use, default: the latest version
    • +
    • spider (string, required) - the spider name
    • +
    • jobid (string, optional) - a job id used to identify the job, overrides the default generated UUID
    • +
    • setting (string, optional) - a Scrapy setting to use when running the spider. + (See Scrapy settings) +
    • +
    • any other parameter is passed as a spider argument; see the sketch after this list. + (See Scrapy spider-arguments) +
    • +
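These parameters correspond to Scrapyd's schedule.json API, which is what a spider run eventually calls on the Scrapyd side. Below is a minimal sketch of the equivalent raw API call (not ScrapydWeb's own code), assuming a Scrapyd instance at 127.0.0.1:6800 and hypothetical 'myproject'/'myspider' names:

import requests

# Keys other than project/_version/spider/jobid/setting (e.g. 'arg1') are passed
# through to the spider as spider arguments; 'setting' may be given multiple times.
data = {
    'project': 'myproject',                 # required (hypothetical name)
    '_version': '2019-01-01T00_00_00',      # optional, defaults to the latest version
    'spider': 'myspider',                   # required (hypothetical name)
    'jobid': '2019-01-01T00_00_00',         # optional, overrides the generated UUID
    'setting': ['CLOSESPIDER_TIMEOUT=60', 'CLOSESPIDER_PAGECOUNT=10'],
    'arg1': 'val1',
}
r = requests.post('http://127.0.0.1:6800/schedule.json', data=data)
print(r.json())   # e.g. {'status': 'ok', 'jobid': '...'}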
      + Timer Task

      +
    • To get the local timezone of your host: +
      $ pip install tzlocal
      +from tzlocal import get_localzone
      +print(get_localzone())
      +
    • +
      +
    • Keep in mind that the start_date limit would be ignored if you select the action to fire the task right now.
    • +
      +
    • Also note that the misfire_grace_time would be saved as None if it's set to 0, which means that the scheduler would still run a misfired task no matter how long its scheduled time has been missed.
    • +
    • Otherwise, just set misfire_grace_time to 1 to ignore any misfired task missed by more than 1 second.
    • +
      +
    • + Check out apscheduler.triggers.cron + and apscheduler.schedulers.base.BaseScheduler.add_job + and userguide for more info; a minimal sketch follows this help section. +
    • +
    +
  • +
+
+ + +
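The notes above, together with the cron fields in the form below (year/month/day/week/hour/minute/second, timezone, jitter, misfire_grace_time, max_instances), correspond to APScheduler's cron trigger and add_job() arguments. A minimal standalone sketch, assuming APScheduler 3.x is installed; the run_my_spider callable and the timezone value are hypothetical:

from apscheduler.schedulers.background import BackgroundScheduler

def run_my_spider():
    # Hypothetical task body; in practice this would fire a spider run,
    # e.g. by POSTing to Scrapyd's schedule.json.
    print('firing spider run')

scheduler = BackgroundScheduler(timezone='Asia/Shanghai')   # local timezone of the host
scheduler.add_job(
    run_my_spider,
    trigger='cron',
    year='*', month='*', day='*', hour='*', minute='0', second='0',  # top of every hour
    jitter=0,                  # random jitter (secs) applied to the fire time; 0 disables it
    misfire_grace_time=600,    # max tolerated delay (secs) for a misfired run
    coalesce=True,             # collapse a backlog of missed runs into a single run
    max_instances=1,           # max concurrently running instances of this task
    id='my_timer_task',
    replace_existing=True,
)
scheduler.start()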
+
+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + +
+ {% if task_id %} + + + + + + + + + {% endif %} + + + + + + + + + + + + + + + + + + + + + +
4-digit year, e.g. 2019;
Defaults to * for any year
+ +
+
+ + + + + +
month (1-12);
Defaults to * for any month;
e.g. 1,6-8 equals 1,6,7,8
+ +
+
+ + + + + +
day (1-31);
Defaults to * for any day;
Can be `1st mon` or `last sun` of the month
+ +
+
+ + + + + +
ISO week (1-53);
Defaults to * for any week;
`import datetime; print(datetime.date(2019, 12, 31).isocalendar()[1])`
+ +
+
+ + + + + + + + + + + + + + + + + + +
hour (0-23);
Defaults to * for any hour;
e.g. 9,17,8-20/4 equals 8,9,12,16,17,20
+ +
+
+ + + + + +
minute (0-59);
Defaults to 0 instead of *;
e.g. */10 to fire every 10 mins
+ +
+
+ + + + + +
second (0-59);
Defaults to 0 instead of *;
Caution with value *
+ +
+
+ + + + + + + + + + + + + + +
To get the local timezone of your host,
run command 'pip install tzlocal' first,
then execute `from tzlocal import get_localzone; print(get_localzone())`
+ +
+
+ + + + + +
Execute task by random delay of [-N, +N] secs;
Defaults to 0;
Search 'jitter' in apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
+ +
+
+ + + + + + +
Max tolerance of delay for misfired task;
Defaults to 600 secs;
See apscheduler.readthedocs.io/en/latest/userguide.html#missed-job-executions-and-coalescing
+ +
+
+ + + + + + + + + + + + + +
Max concurrently running instances of this task;
Defaults to 1;
See apscheduler.readthedocs.io/en/latest/userguide.html#limiting-the-number-of-concurrently-executing-instances-of-a-job
+ +
+
+ + + + +
+ + + + + + + + +
+
+ +
+
+ + {% if SCRAPYD_SERVERS_AMOUNT > 1 %} + {% include 'scrapydweb/include_multinodes_checkboxes.html' %} + {% endif %} +
+ + +
+
+ + + + + + +{% endblock %} diff --git a/templates/scrapydweb/schedule_results.html b/templates/scrapydweb/schedule_results.html new file mode 100644 index 00000000..08e5fef5 --- /dev/null +++ b/templates/scrapydweb/schedule_results.html @@ -0,0 +1,148 @@ +{% extends 'base.html' %} + +{% block title %}run results{% endblock %} + +{% block head %} + + +{% endblock %} + + +{% block body %} +

Run Spider

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {% for SCRAPYD_SERVER in SCRAPYD_SERVERS %} + {% if loop.index in selected_nodes and loop.index != first_selected_node %} + + + + + + + + + + + + {% endif %} + {% endfor %} + + + +
+ + invert selection + IndexStatsNode NameStatusProjectVersionSpiderJobid
{{ first_selected_node }} + 1 %}"_blank"{% else %}"_self"{% endif %} + >{{ SCRAPYD_SERVERS[first_selected_node-1] }} + {{ js['node_name'] }}{{ js['status'] }}{{ project }}{{ version }}{{ spider }}{{ js['jobid'] }}
{{ loop.index }} + 1 %}"_blank"{% else %}"_self"{% endif %} + >{{ SCRAPYD_SERVERS[loop.index-1] }} +
+ + +
+ + +{% endblock %} diff --git a/templates/scrapydweb/send_text.html b/templates/scrapydweb/send_text.html new file mode 100644 index 00000000..47731faa --- /dev/null +++ b/templates/scrapydweb/send_text.html @@ -0,0 +1,63 @@ +{% extends 'base.html' %} + +{% block title %}send text{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} +

+ Send text via + Slack, + Telegram, or + Email (click to test) +

+
+"""
+How to send text and get alerted in Scrapy projects:
+"""
+
+import scrapy
+from w3lib.http import basic_auth_header
+
+
+# Suppose ScrapydWeb is available at http://127.0.0.1:5000
+# Do remember to add '127.0.0.1' to allowed_domains
+base_url = 'http://127.0.0.1:5000'
+headers = dict(Authorization=basic_auth_header('username', 'password'))
+callback_none = lambda x: None
+
+# Via Slack:
+yield scrapy.Request(base_url+'/slack/some-text-to-channel-general', headers=headers, callback=callback_none)
+yield scrapy.Request(base_url+'/slack/random/send-to-channel-random', callback=callback_none)
+yield scrapy.FormRequest(url=base_url+'/slack',
+                         headers=headers,
+                         formdata=dict(channel='random', key='value', a='1'),
+                         callback=callback_none)
+
+# Via Telegram:
+yield scrapy.Request(base_url+'/tg/some-text-to-telegram', callback=callback_none)
+yield scrapy.Request(base_url+'/tg/123/some-text-to-chat-id-123', callback=callback_none)
+# JSONRequest is available in Scrapy>=1.7.1; it allows values in the posted data to be of int type.
+yield scrapy.http.JSONRequest(url=base_url+'/tg',
+                              data=dict(chat_id=123, key='value', b=2),
+                              callback=callback_none)
+
+# Via Email:
+yield scrapy.Request(base_url+'/email/some-text-to-email', callback=callback_none)
+yield scrapy.Request(base_url+'/email/new-subject/send-with-new-subject', callback=callback_none)
+yield scrapy.FormRequest(url=base_url+'/email',
+                         formdata=dict(recipients='name1@example.com; name2@example.com',
+                                       subject='post to send an email', key='value', c='3'),
+                         callback=callback_none)
+
+
+
+ + + +{% endblock %} diff --git a/templates/scrapydweb/servers.html b/templates/scrapydweb/servers.html new file mode 100644 index 00000000..737f0d2e --- /dev/null +++ b/templates/scrapydweb/servers.html @@ -0,0 +1,954 @@ +{% extends 'base.html' %} + +{% block title %}servers{% endblock %} + +{% block head %} + + {% if SCRAPYD_SERVERS_AMOUNT > 1 and (pageview == 1 or pageview % CHECK_LATEST_VERSION_FREQ == 0) %} + + + {% else %} + + {% endif %} +{% endblock %} + + +{% block body %} +{% if (SCRAPYD_SERVERS_AMOUNT > 1 and pageview == 1) or IS_IE_EDGE %} + +{% endif %} + + +

Monitor and control all of your Scrapyd servers.

+ + + +
+ +
+ + + + + +{% endblock %} diff --git a/templates/scrapydweb/settings.html b/templates/scrapydweb/settings.html new file mode 100644 index 00000000..f97787bd --- /dev/null +++ b/templates/scrapydweb/settings.html @@ -0,0 +1,203 @@ +{% extends 'base.html' %} + +{% block title %}settings{% endblock %} + +{% block head %} + +{% endblock %} + +{% block body %} +

Settings

+ +
+
    +
  • +

    default_settings.py

    +

    {{ DEFAULT_SETTINGS_PY_PATH }}

    +
  • +
  • +

    user settings

    +

    {{ SCRAPYDWEB_SETTINGS_PY_PATH }}

    +
  • +
  • main_pid: {{ MAIN_PID }}

  • +
  • logparser_pid: {{ LOGPARSER_PID }}

  • +
  • poll_pid: {{ POLL_PID }}

  • +
+
+ +
+ +
    + +
    +

    ScrapydWeb

    +
      +
    • +

      * server *

      +
      {{ scrapydweb_server }}
      +
    • +
    • +

      ENABLE_HTTPS = {{ ENABLE_HTTPS }}

      +
      {{ enable_https_details }}
      +
    • +
    +
    + +
    +

    Scrapy

    +
      +
    • +

      SCRAPY_PROJECTS_DIR

      +
      {{ SCRAPY_PROJECTS_DIR }}
      +
    • +
    +
    + +
    +

    Scrapyd

    +
      +
    • +

      * servers *

      +
      {{ servers }}
      +
    • +
    • +

      LOCAL_SCRAPYD_SERVER

      +
      {{ LOCAL_SCRAPYD_SERVER }}
      +
    • +
    • +

      LOCAL_SCRAPYD_LOGS_DIR

      +
      {{ LOCAL_SCRAPYD_LOGS_DIR }}
      +
    • +
    • +

      SCRAPYD_LOG_EXTENSIONS

      +
      {{ SCRAPYD_LOG_EXTENSIONS }}
      +
    • +
    +
    + +
    +

    LogParser

    +
      +
    • ENABLE_LOGPARSER = {{ ENABLE_LOGPARSER }}

    • +
    • version: {{ logparser_version }}

    • +
    • +

      settings.py

      +

      {{ logparser_settings_py_path }}

      +
    • +
    • BACKUP_STATS_JSON_FILE = {{ BACKUP_STATS_JSON_FILE }}

    • +
    +
    + +
    +

    Timer tasks

    +
      +
    • scheduler.state: {{ scheduler_state }}

    • +
    • JOBS_SNAPSHOT_INTERVAL = {{ JOBS_SNAPSHOT_INTERVAL }}

    • +
    +
    + +
    +

    Run Spider

    +
      +
    • +

      details

      +
      {{ run_spider_details }}
      +
    • +
    +
    + +
    +

    Page Display

    +
      +
    • +

      details

      +
      {{ page_display_details }}
      +
    • +
    +
    + +
    +

    Send Text

    +
      +
    • +

      Slack

      +
      {{ slack_details }}
      +
    • +
    • +

      Telegram

      +
      {{ telegram_details }}
      +
    • +
    • +

      Email

      +
      {{ email_details }}
      +
    • +
    • +

      email sender & recipients

      +
      {{ email_sender_recipients }}
      +
    • +
    • +

      email smtp settings

      +
      {{ email_smtp_settings }}
      +
    • +
    +
    + +
    +

    Monitor & Alert

    +
      +
    • ENABLE_MONITOR = {{ ENABLE_MONITOR }}

    • +
    • +

      poll interval

      +
      {{ poll_interval }}
      +
    • +
    • +

      alert switcher

      +
      {{ alert_switcher }}
      +
    • +
    • +

      alert working time

      +
      {{ alert_working_time }}
      +
    • +
    • +

      triggers

      +
      {{ alert_triggers|safe }}
      +
    • +
    +
    + +
    +

    System

    +
      +
    • DEBUG = {{ DEBUG }}

    • +
    • VERBOSE = {{ VERBOSE }}

    • +
    • +

      DATA_PATH

      +
      {{ DATA_PATH }}
      +
    • +
    • +

      DATABASE

      +
      {{ database_details }}
      +
    • +
    +
    + +
+
+ + +{% endblock %} diff --git a/templates/scrapydweb/stats.html b/templates/scrapydweb/stats.html new file mode 100644 index 00000000..2ca8cf75 --- /dev/null +++ b/templates/scrapydweb/stats.html @@ -0,0 +1,426 @@ +{% extends 'base.html' %} + +{% block title %}stats{% endblock %} + +{% block head %} + + + + +{% endblock %} + + +{% block loader %} +
+ + +{% endblock %} + + +{% block body %} +

PROJECT ({{ project }}), SPIDER ({{ spider }})

+ +{% if url_refresh %} +
+ {% if not url_jump %} + Click to refresh + {% else %} + {% if 'realtime=True' in url_jump %} + Click to refresh + Realtime version + {% else %} + Cached version + Click to refresh + {% endif %} + {% endif %} +
+{% endif %} + +
+
    +
  • Log analysis
  • +
  • Log categorization
  • + {% if datas|length > 1 %} +
  • Progress visualization
  • + {% endif %} +
  • View log
  • + {% if crawler_stats %} +
  • Crawler.stats
  • + {% endif %} + {% if crawler_engine %} +
  • Crawler.engine
  • + {% endif %} +
+
    +
  • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    project{{ project }}
    spider{{ spider }}
    job{{ job }}
    first_log_time{{ first_log_time }}
    latest_log_time{{ latest_log_time }}
    runtime{{ runtime }}
    crawled_pages + {% if pages is none %} + N/A + {% else %} + {{ pages }} + {% endif %} +
    scraped_items + {% if items is none %} + N/A + {% else %} + {{ items }} + {% endif %} +
    shutdown_reason{{ shutdown_reason }}
    finish_reason{{ finish_reason }}
    log_critical_count{{ log_categories['critical_logs']['count'] }}
    log_error_count{{ log_categories['error_logs']['count'] }}
    log_warning_count{{ log_categories['warning_logs']['count'] }}
    log_redirect_count{{ log_categories['redirect_logs']['count'] }}
    log_retry_count{{ log_categories['retry_logs']['count'] }}
    log_ignore_count{{ log_categories['ignore_logs']['count'] }}
    latest_crawl
    latest_scrape
    latest_log
    current_time
    latest_item{{ latest_matches['latest_item'] or 'N/A' }}
    +
    +
  • + +
  • +
    +
      +

      WARNING+

      + {% for title in ['critical_logs', 'error_logs', 'warning_logs'] %} + {% if not log_categories[title]['count'] %} + {% else %} +
    • +
      +

      +
      {{ title }}
      + {% if log_categories[title]['details']|length < log_categories[title]['count'] %} + last {{ log_categories[title]['details']|length }} of {{ log_categories[title]['count'] }} + {% else %} + {{ log_categories[title]['count'] }} in total + {% endif %} +

      + +
      + + {% for detail in log_categories[title]['details'] %} +
      {{ detail }}
      + {% endfor %} +
    • + {% endif %} + {% endfor %} +
    +
    + +
    +
      +

      INFO

      + + {% for title in ['redirect_logs', 'retry_logs', 'ignore_logs'] %} + {% if not log_categories[title]['count'] %} + {% else %} +
    • +
      +

      +
      {{ title }}
      + {% if log_categories[title]['details']|length < log_categories[title]['count'] %} + last {{ log_categories[title]['details']|length }} of {{ log_categories[title]['count'] }} + {% else %} + {{ log_categories[title]['count'] }} in total + {% endif %} +

      + +
      + + {% for detail in log_categories[title]['details'] %} +
      {{ detail }}
      + {% endfor %} +
    • + {% endif %} + {% endfor %} +
    +
    + +
    +
      +

      DEBUG

      + + {% for title, log in latest_matches.items() %} + {% if not log %} + {% else %} +
    • +
      +

      {{ title }}

      + +
      +
      {{ log }}
      +
    • + {% endif %} + {% endfor %} +
    +
    +
  • + + {% if datas|length > 1 %} +
  • +
    +
    +
  • + {% endif %} + +
  • +
    + +
      +
    • +
      +

      Head

      + +
      +
      {{ head }}
      +
    • +
    • +
      +

      Tail

      + +
      +
      {{ tail }}
      +
    • + {% if url_opt_opposite %} +
    • +
      +

      Log

      + +
      +

      {{ url_opt_opposite }}

      +
    • + {% endif %} + {% if url_source %} +
    • +
      +

      Source

      + +
      +

      {{ url_source }}

      +
    • + {% endif %} +
    +
    +
  • + + {% if crawler_stats %} +
  • +
    + + {% for k, v in crawler_stats.items() %} + + {% endfor %} +
    {{ k }}{{ v }}
    +
    +
  • + {% endif %} + + {% if crawler_engine %} +
  • +
    + + {% for k, v in crawler_engine.items() %} + + {% endfor %} +
    {{ k }}{{ v }}
    +
    +
  • + {% endif %} +
+
+ + + + + + + + +{% if datas|length > 1 %} + +{% endif %} + + + + + + +{% endblock %} diff --git a/templates/scrapydweb/stats_mobileui.html b/templates/scrapydweb/stats_mobileui.html new file mode 100644 index 00000000..64883d10 --- /dev/null +++ b/templates/scrapydweb/stats_mobileui.html @@ -0,0 +1,424 @@ +{% extends 'base_mobileui.html' %} + +{% block title %}stats{% endblock %} + +{% block head %} + + + + + +{% endblock %} + +{% block loader %} +
+ + +{% endblock %} + + +{% block neck %} +Jobs + +{% if url_refresh %} +Parsed +{% endif %} +{% endblock %} + + +{% block body %} +
+
    +
  • Analysis
  • +
  • Categories
  • + {% if datas|length > 1 %} +
  • Charts
  • + {% endif %} +
  • Logs
  • + {% if crawler_stats %} +
  • Crawler.stats
  • + {% endif %} + {% if crawler_engine %} +
  • Crawler.engine
  • + {% endif %} +
+ +
    +
  • +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    project{{ project }}
    spider{{ spider }}
    job{{ job }}
    first_log_time{{ first_log_time }}
    latest_log_time{{ latest_log_time }}
    runtime{{ runtime }}
    crawled_pages + {% if pages is none %} + N/A + {% else %} + {{ pages }} + {% endif %} +
    scraped_items + {% if items is none %} + N/A + {% else %} + {{ items }} + {% endif %} +
    shutdown_reason{{ shutdown_reason }}
    finish_reason{{ finish_reason }}
    log_critical_count{{ log_categories['critical_logs']['count'] }}
    log_error_count{{ log_categories['error_logs']['count'] }}
    log_warning_count{{ log_categories['warning_logs']['count'] }}
    log_redirect_count{{ log_categories['redirect_logs']['count'] }}
    log_retry_count{{ log_categories['retry_logs']['count'] }}
    log_ignore_count{{ log_categories['ignore_logs']['count'] }}
    latest_crawl
    latest_scrape
    latest_log
    current_time
    latest_item{{ latest_matches['latest_item'] or 'N/A' }}
    +
    +
  • + +
  • +
    +
      +

      WARNING+

      + {% for title in ['critical_logs', 'error_logs', 'warning_logs'] %} + {% if not log_categories[title]['count'] %} + {% else %} +
    • +
      +

      +
      {{ title }}
      + {% if log_categories[title]['details']|length < log_categories[title]['count'] %} + last {{ log_categories[title]['details']|length }} of {{ log_categories[title]['count'] }} + {% else %} + {{ log_categories[title]['count'] }} in total + {% endif %} +

      + +
      + + {% for detail in log_categories[title]['details'] %} +
      {{ detail }}
      + {% endfor %} +
    • + {% endif %} + {% endfor %} +
    +
    + +
    +
      +

      INFO

      + + {% for title in ['redirect_logs', 'retry_logs', 'ignore_logs'] %} + {% if not log_categories[title]['count'] %} + {% else %} +
    • +
      +

      +
      {{ title }}
      + {% if log_categories[title]['details']|length < log_categories[title]['count'] %} + last {{ log_categories[title]['details']|length }} of {{ log_categories[title]['count'] }} + {% else %} + {{ log_categories[title]['count'] }} in total + {% endif %} +

      + +
      + + {% for detail in log_categories[title]['details'] %} +
      {{ detail }}
      + {% endfor %} +
    • + {% endif %} + {% endfor %} +
    +
    + +
    +
      +

      DEBUG

      + + {% for title, log in latest_matches.items() %} + {% if not log %} + {% else %} +
    • +
      +

      {{ title }}

      + +
      +
      {{ log }}
      +
    • + {% endif %} + {% endfor %} +
    +
    +
  • + + {% if datas|length > 1 %} +
  • +
    +
    +
  • + {% endif %} + +
  • +
    + + +
    +
  • + + {% if crawler_stats %} +
  • +
    + + {% for k, v in crawler_stats.items() %} + + {% endfor %} +
    {{ k.replace('/', ' / ').replace('_', ' ') }}{{ v }}
    +
    +
  • + {% endif %} + + {% if crawler_engine %} +
  • +
    + + {% for k, v in crawler_engine.items() %} + + {% endfor %} +
    {{ k }}{{ v }}
    +
    +
  • + {% endif %} +
+
+ + + + + + + + +{% if datas|length > 1 %} + +{% endif %} + + + + + + + + + +{% endblock %} diff --git a/templates/scrapydweb/task_job_results.html b/templates/scrapydweb/task_job_results.html new file mode 100644 index 00000000..37d01fd3 --- /dev/null +++ b/templates/scrapydweb/task_job_results.html @@ -0,0 +1,142 @@ +{% extends 'base.html' %} + +{% block title %}timer task job results{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} + + + +

+ Tasks + Task #{{ task_id }} {% if task and task.name %}({{ task.name }}){% endif %} + Result #{{ task_result_id }} + {% if task %}{{ task.project }}/{{ task.version }}/{{ task.spider }}/{{ task.jobid }}{% endif %} +

+ +
+ +
+ + + +{% endblock %} diff --git a/templates/scrapydweb/task_results.html b/templates/scrapydweb/task_results.html new file mode 100644 index 00000000..beeb3b1c --- /dev/null +++ b/templates/scrapydweb/task_results.html @@ -0,0 +1,150 @@ +{% extends 'base.html' %} + +{% block title %}timer task results{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} + + + +

+ Tasks + Task #{{ task_id }} {% if task and task.name %}({{ task.name }}){% endif %} + {% if task %}{{ task.project }}/{{ task.version }}/{{ task.spider }}/{{ task.jobid }}{% endif %} +

+ +
+ +
+ + + +{% endblock %} diff --git a/templates/scrapydweb/task_results_with_job.html b/templates/scrapydweb/task_results_with_job.html new file mode 100644 index 00000000..c933482b --- /dev/null +++ b/templates/scrapydweb/task_results_with_job.html @@ -0,0 +1,145 @@ +{% extends 'base.html' %} + +{% block title %}timer task results with job{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} + + + +

+ Tasks + Task #{{ task_id }} {% if task and task.name %}({{ task.name }}){% endif %} + {% if task %}{{ task.project }}/{{ task.version }}/{{ task.spider }}/{{ task.jobid }}{% endif %} +

+{% if task_results.items %} +

[node {{ task_results.items[0].node }}] {{ task_results.items[0].server }}

+{% endif %} + +
+ +
+ + + +{% endblock %} diff --git a/templates/scrapydweb/tasks.html b/templates/scrapydweb/tasks.html new file mode 100644 index 00000000..3a9ac82c --- /dev/null +++ b/templates/scrapydweb/tasks.html @@ -0,0 +1,349 @@ +{% extends 'base.html' %} + +{% block title %}timer tasks{% endblock %} + +{% block head %} + +{% endblock %} + + +{% block body %} + + + +
+

Get the list of timer tasks.

+ + + +
+ + +
+ +
+ +{% if tasks.items|length == 0 %} + +{% endif %} + + + +{% endblock %} diff --git a/templates/scrapydweb/utf8.html b/templates/scrapydweb/utf8.html new file mode 100644 index 00000000..28282b8f --- /dev/null +++ b/templates/scrapydweb/utf8.html @@ -0,0 +1,108 @@ +{% extends 'base.html' %} + +{% block title %}log{% endblock %} + +{% block head %} + + +{% endblock %} + +{% block loader %} +
+{% endblock %} + + +{% block body %} +
+
+
+
+
+
+
+
+ +

PROJECT ({{ project }}), SPIDER ({{ spider }})

+ +
+ + +
+ + +
+
{{ text }}
+
+

PROJECT ({{ project }}), SPIDER ({{ spider }})

+{% if url_refresh %} +Click to refresh +{% endif %} + + + + + + + + + +{% endblock %} diff --git a/templates/scrapydweb/utf8_mobileui.html b/templates/scrapydweb/utf8_mobileui.html new file mode 100644 index 00000000..ac5263c8 --- /dev/null +++ b/templates/scrapydweb/utf8_mobileui.html @@ -0,0 +1,81 @@ +{% extends 'base_mobileui.html' %} + +{% block title %}log{% endblock %} + +{% block head %} + + +{% endblock %} + +{% block loader %} +
+{% endblock %} + + +{% block neck %} +Jobs +Stats +{% endblock %} + + +{% block body %} +
+
+
+
+
+
+
+
+ + +
+

PROJECT ({{ project }})
SPIDER ({{ spider }})

+
{{ text }}
+ +

PROJECT ({{ project }})
SPIDER ({{ spider }})

+{% if url_refresh %} + +Press to refresh + +
+
+{% endif %} +
+ + + + + + +{% endblock %} diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/utils/check_app_config.py b/utils/check_app_config.py new file mode 100644 index 00000000..33f439df --- /dev/null +++ b/utils/check_app_config.py @@ -0,0 +1,617 @@ +# coding: utf-8 +import logging +from multiprocessing.dummy import Pool as ThreadPool +import os +import re + +from ..common import handle_metadata, handle_slash, json_dumps, session +from ..models import create_jobs_table, db +from ..utils.scheduler import scheduler +from ..utils.setup_database import test_database_url_pattern +from ..vars import ( + ALLOWED_SCRAPYD_LOG_EXTENSIONS, + ALERT_TRIGGER_KEYS, + SCHEDULER_STATE_DICT, + STATE_PAUSED, + STATE_RUNNING, + SCHEDULE_ADDITIONAL, + STRICT_NAME_PATTERN, + UA_DICT, + jobs_table_map, +) +from .send_email import send_email +from .sub_process import init_logparser, init_poll + + +logger = logging.getLogger(__name__) + +REPLACE_URL_NODE_PATTERN = re.compile(r"(:\d+/)\d+/") +EMAIL_PATTERN = re.compile(r"^[^@]+@[^@]+\.[^@]+$") +HASH = "#" * 100 +# r'^(?:(?:(.*?)\:)(?:(.*?)@))?(.*?)(?:\:(.*?))?(?:#(.*?))?$' +SCRAPYD_SERVER_PATTERN = re.compile( + r""" + ^ + (?: + (?:(.*?):) # username: + (?:(.*?)@) # password@ + )? + (.*?) # ip + (?::(.*?))? # :port + (?:\#(.*?))? # #group + $ + """, + re.X, +) + + +def check_app_config(config): + def check_assert( + key, + default, + is_instance, + allow_zero=True, + non_empty=False, + containing_type=None, + ): + if is_instance is int: + if allow_zero: + should_be = "a non-negative integer" + else: + should_be = "a positive integer" + else: + should_be = "an instance of %s%s" % ( + is_instance, + " and not empty" if non_empty else "", + ) + + value = config.setdefault(key, default) + kws = dict( + key=key, + should_be=should_be, + containing_type=", containing elements of type %s" % containing_type + if containing_type + else "", + value="'%s'" % value if isinstance(value, str) else value, + ) + to_assert = u"{key} should be {should_be}{containing_type}. Current value: {value}".format( + **kws + ) + + assert ( + isinstance(value, is_instance) + and ( + not isinstance(value, bool) if is_instance is int else True + ) # isinstance(True, int) => True + and (value > (-1 if allow_zero else 0) if is_instance is int else True) + and (value if non_empty else True) + and ( + all([isinstance(i, containing_type) for i in value]) + if containing_type + else True + ) + ), to_assert + + logger.debug("Checking app config") + + # ScrapydWeb + check_assert("SCRAPYDWEB_BIND", "0.0.0.0", str, non_empty=True) + SCRAPYDWEB_PORT = config.setdefault("SCRAPYDWEB_PORT", 5000) + try: + assert not isinstance(SCRAPYDWEB_PORT, bool) + SCRAPYDWEB_PORT = int(SCRAPYDWEB_PORT) + assert SCRAPYDWEB_PORT > 0 + except (TypeError, ValueError, AssertionError): + assert False, ( + "SCRAPYDWEB_PORT should be a positive integer. 
Current value: %s" + % SCRAPYDWEB_PORT + ) + + check_assert("ENABLE_AUTH", False, bool) + if config.get("ENABLE_AUTH", False): + # May be int from config file + check_assert("USERNAME", "", str, non_empty=True) + check_assert("PASSWORD", "", str, non_empty=True) + handle_metadata("username", config["USERNAME"]) + handle_metadata("password", config["PASSWORD"]) + logger.info( + "Basic auth enabled with USERNAME/PASSWORD: '%s'/'%s'", + config["USERNAME"], + config["PASSWORD"], + ) + + check_assert("ENABLE_HTTPS", False, bool) + if config.get("ENABLE_HTTPS", False): + logger.info("HTTPS mode enabled: ENABLE_HTTPS = %s", config["ENABLE_HTTPS"]) + for k in ["CERTIFICATE_FILEPATH", "PRIVATEKEY_FILEPATH"]: + check_assert(k, "", str, non_empty=True) + assert os.path.isfile(config[k]), "%s not found: %s" % (k, config[k]) + logger.info( + "Running in HTTPS mode: %s, %s", + config["CERTIFICATE_FILEPATH"], + config["PRIVATEKEY_FILEPATH"], + ) + + _protocol = "https" if config.get("ENABLE_HTTPS", False) else "http" + _bind = config.get("SCRAPYDWEB_BIND", "0.0.0.0") + _bind = "127.0.0.1" if _bind == "0.0.0.0" else _bind + config["URL_SCRAPYDWEB"] = "%s://%s:%s" % ( + _protocol, + _bind, + config.get("SCRAPYDWEB_PORT", 5000), + ) + handle_metadata("url_scrapydweb", config["URL_SCRAPYDWEB"]) + logger.info("Setting up URL_SCRAPYDWEB: %s", config["URL_SCRAPYDWEB"]) + + # Scrapy + check_assert("SCRAPY_PROJECTS_DIR", "", str) + SCRAPY_PROJECTS_DIR = config.get("SCRAPY_PROJECTS_DIR", "") + if SCRAPY_PROJECTS_DIR: + assert os.path.isdir(SCRAPY_PROJECTS_DIR), ( + "SCRAPY_PROJECTS_DIR not found: %s" % SCRAPY_PROJECTS_DIR + ) + logger.info( + "Setting up SCRAPY_PROJECTS_DIR: %s", handle_slash(SCRAPY_PROJECTS_DIR) + ) + + # Scrapyd + check_scrapyd_servers(config) + # For JobsView + for node, scrapyd_server in enumerate(config["SCRAPYD_SERVERS"], 1): + # Note that check_app_config() is executed multiple times in test + if node not in jobs_table_map: + jobs_table_map[node] = create_jobs_table( + re.sub(STRICT_NAME_PATTERN, "_", scrapyd_server) + ) + db.create_all(bind="jobs") + logger.debug("Created %s tables for JobsView", len(jobs_table_map)) + + check_assert("LOCAL_SCRAPYD_LOGS_DIR", "", str) + check_assert("LOCAL_SCRAPYD_SERVER", "", str) + LOCAL_SCRAPYD_LOGS_DIR = config.get("LOCAL_SCRAPYD_LOGS_DIR", "") + if LOCAL_SCRAPYD_LOGS_DIR: + assert os.path.isdir(LOCAL_SCRAPYD_LOGS_DIR), ( + "LOCAL_SCRAPYD_LOGS_DIR not found: %s" % LOCAL_SCRAPYD_LOGS_DIR + ) + logger.info( + "Setting up LOCAL_SCRAPYD_LOGS_DIR: %s", + handle_slash(LOCAL_SCRAPYD_LOGS_DIR), + ) + LOCAL_SCRAPYD_SERVER = config.get("LOCAL_SCRAPYD_SERVER", "") + if LOCAL_SCRAPYD_SERVER and not re.search(r":\d+$", LOCAL_SCRAPYD_SERVER): + LOCAL_SCRAPYD_SERVER += ":6800" + config["LOCAL_SCRAPYD_SERVER"] = LOCAL_SCRAPYD_SERVER + if len(config["SCRAPYD_SERVERS"]) > 1: + assert LOCAL_SCRAPYD_SERVER, ( + "The LOCAL_SCRAPYD_SERVER option must be set up since you have added multiple Scrapyd servers " + "and set up the LOCAL_SCRAPYD_LOGS_DIR option.\nOtherwise, just set LOCAL_SCRAPYD_LOGS_DIR to ''" + ) + else: + if not LOCAL_SCRAPYD_SERVER: + config["LOCAL_SCRAPYD_SERVER"] = config["SCRAPYD_SERVERS"][0] + LOCAL_SCRAPYD_SERVER = config["LOCAL_SCRAPYD_SERVER"] + logger.info("Setting up LOCAL_SCRAPYD_SERVER: %s", LOCAL_SCRAPYD_SERVER) + assert LOCAL_SCRAPYD_SERVER in config["SCRAPYD_SERVERS"], ( + "LOCAL_SCRAPYD_SERVER '%s' is not in the Scrapyd servers you have added:\n%s" + % (LOCAL_SCRAPYD_SERVER, config["SCRAPYD_SERVERS"]) + ) + # else: + # _path = 
os.path.join(os.path.expanduser('~'), 'logs') + # if os.path.isdir(_path): + # config['LOCAL_SCRAPYD_LOGS_DIR'] = _path + # logger.info("Found LOCAL_SCRAPYD_LOGS_DIR: %s", config['LOCAL_SCRAPYD_LOGS_DIR']) + + check_assert( + "SCRAPYD_LOG_EXTENSIONS", + ALLOWED_SCRAPYD_LOG_EXTENSIONS, + list, + non_empty=True, + containing_type=str, + ) + SCRAPYD_LOG_EXTENSIONS = config.get( + "SCRAPYD_LOG_EXTENSIONS", ALLOWED_SCRAPYD_LOG_EXTENSIONS + ) + assert all([not i or i.startswith(".") for i in SCRAPYD_LOG_EXTENSIONS]), ( + "SCRAPYD_LOG_EXTENSIONS should be a list like %s. " + "Current value: %s" % (ALLOWED_SCRAPYD_LOG_EXTENSIONS, SCRAPYD_LOG_EXTENSIONS) + ) + logger.info( + "Locating scrapy logfiles with SCRAPYD_LOG_EXTENSIONS: %s", + SCRAPYD_LOG_EXTENSIONS, + ) + + # LogParser + check_assert("ENABLE_LOGPARSER", False, bool) + if config.get("ENABLE_LOGPARSER", False): + assert config.get("LOCAL_SCRAPYD_LOGS_DIR", ""), ( + "In order to automatically run LogParser at startup, you have to set up the LOCAL_SCRAPYD_LOGS_DIR option " + "first.\nOtherwise, set 'ENABLE_LOGPARSER = False' if you are not running any Scrapyd service " + "on the current ScrapydWeb host.\nNote that you can run the LogParser service separately " + "via command 'logparser' as you like. " + ) + check_assert("BACKUP_STATS_JSON_FILE", True, bool) + + # Run Spider + check_assert("SCHEDULE_EXPAND_SETTINGS_ARGUMENTS", False, bool) + check_assert("SCHEDULE_CUSTOM_USER_AGENT", "", str) + check_assert("LONG_RUNNING_SCRAPER_STOP_INTERVAL", 300, int) + config["SCHEDULE_CUSTOM_USER_AGENT"] = ( + config["SCHEDULE_CUSTOM_USER_AGENT"] or "Mozilla/5.0" + ) + UA_DICT.update(custom=config["SCHEDULE_CUSTOM_USER_AGENT"]) + if config.get("SCHEDULE_USER_AGENT", None) is not None: + check_assert("SCHEDULE_USER_AGENT", "", str) + user_agent = config["SCHEDULE_USER_AGENT"] + assert user_agent in UA_DICT.keys(), ( + "SCHEDULE_USER_AGENT should be any value of %s. Current value: %s" + % (UA_DICT.keys(), user_agent) + ) + if config.get("SCHEDULE_ROBOTSTXT_OBEY", None) is not None: + check_assert("SCHEDULE_ROBOTSTXT_OBEY", False, bool) + if config.get("SCHEDULE_COOKIES_ENABLED", None) is not None: + check_assert("SCHEDULE_COOKIES_ENABLED", False, bool) + if config.get("SCHEDULE_CONCURRENT_REQUESTS", None) is not None: + check_assert("SCHEDULE_CONCURRENT_REQUESTS", 16, int, allow_zero=False) + if config.get("SCHEDULE_DOWNLOAD_DELAY", None) is not None: + download_delay = config["SCHEDULE_DOWNLOAD_DELAY"] + if isinstance(download_delay, float): + assert download_delay >= 0.0, ( + "SCHEDULE_DOWNLOAD_DELAY should a non-negative number. 
Current value: %s" + % download_delay + ) + else: + check_assert("SCHEDULE_DOWNLOAD_DELAY", 0, int) + check_assert("SCHEDULE_ADDITIONAL", SCHEDULE_ADDITIONAL, str) + + # Page Display + check_assert("SHOW_SCRAPYD_ITEMS", True, bool) + check_assert("SHOW_JOBS_JOB_COLUMN", False, bool) + check_assert("JOBS_FINISHED_JOBS_LIMIT", 0, int) + check_assert("JOBS_RELOAD_INTERVAL", 300, int) + check_assert("DAEMONSTATUS_REFRESH_INTERVAL", 10, int) + + # Send text + check_assert("SLACK_TOKEN", "", str) + check_assert("SLACK_CHANNEL", "", str) + config["SLACK_CHANNEL"] = config["SLACK_CHANNEL"] or "general" + + check_assert("TELEGRAM_TOKEN", "", str) + check_assert("TELEGRAM_CHAT_ID", 0, int) + + check_assert("EMAIL_PASSWORD", "", str) + if config.get("EMAIL_PASSWORD", ""): + logger.debug("Found EMAIL_PASSWORD, checking email settings") + check_assert("EMAIL_SUBJECT", "", str) + check_assert( + "EMAIL_USERNAME", "", str + ) # '' would default to config['EMAIL_SENDER'] + # check_assert('EMAIL_PASSWORD', '', str, non_empty=True) + check_assert("EMAIL_SENDER", "", str, non_empty=True) + EMAIL_SENDER = config["EMAIL_SENDER"] + assert re.search(EMAIL_PATTERN, EMAIL_SENDER), ( + "EMAIL_SENDER should contain '@', like 'username@gmail.com'. Current value: %s" + % EMAIL_SENDER + ) + check_assert("EMAIL_RECIPIENTS", [], list, non_empty=True, containing_type=str) + EMAIL_RECIPIENTS = config["EMAIL_RECIPIENTS"] + assert all([re.search(EMAIL_PATTERN, i) for i in EMAIL_RECIPIENTS]), ( + "All elements in EMAIL_RECIPIENTS should contain '@', like 'username@gmail.com'. " + "Current value: %s" + ) % EMAIL_RECIPIENTS + if not config.get("EMAIL_USERNAME", ""): + config["EMAIL_USERNAME"] = config["EMAIL_SENDER"] + + check_assert("SMTP_SERVER", "", str, non_empty=True) + check_assert("SMTP_PORT", 0, int, allow_zero=False) + check_assert("SMTP_OVER_SSL", False, bool) + check_assert("SMTP_CONNECTION_TIMEOUT", 30, int, allow_zero=False) + + # Monitor & Alert + check_assert("ENABLE_MONITOR", False, bool) + if config.get("ENABLE_MONITOR", False): + check_assert("POLL_ROUND_INTERVAL", 300, int, allow_zero=False) + check_assert("POLL_REQUEST_INTERVAL", 10, int, allow_zero=False) + + check_assert("ENABLE_SLACK_ALERT", False, bool) + check_assert("ENABLE_TELEGRAM_ALERT", False, bool) + check_assert("ENABLE_EMAIL_ALERT", False, bool) + + # For compatibility with Python 3 using range() + try: + config["ALERT_WORKING_DAYS"] = list(config.get("ALERT_WORKING_DAYS", [])) + except TypeError: + pass + check_assert( + "ALERT_WORKING_DAYS", [], list, non_empty=True, containing_type=int + ) + ALERT_WORKING_DAYS = config["ALERT_WORKING_DAYS"] + assert all( + [not isinstance(i, bool) and i in range(1, 8) for i in ALERT_WORKING_DAYS] + ), ( + "Element in ALERT_WORKING_DAYS should be between 1 and 7. Current value: %s" + % ALERT_WORKING_DAYS + ) + + try: + config["ALERT_WORKING_HOURS"] = list(config.get("ALERT_WORKING_HOURS", [])) + except TypeError: + pass + check_assert( + "ALERT_WORKING_HOURS", [], list, non_empty=True, containing_type=int + ) + ALERT_WORKING_HOURS = config["ALERT_WORKING_HOURS"] + assert all( + [not isinstance(i, bool) and i in range(24) for i in ALERT_WORKING_HOURS] + ), ( + "Element in ALERT_WORKING_HOURS should be between 0 and 23. 
Current value: %s" + % ALERT_WORKING_HOURS + ) + + check_assert("ON_JOB_RUNNING_INTERVAL", 0, int) + check_assert("ON_JOB_FINISHED", False, bool) + + for k in ALERT_TRIGGER_KEYS: + check_assert("LOG_%s_THRESHOLD" % k, 0, int) + check_assert("LOG_%s_TRIGGER_STOP" % k, False, bool) + check_assert("LOG_%s_TRIGGER_FORCESTOP" % k, False, bool) + + if config.get("ENABLE_SLACK_ALERT", False): + check_assert("SLACK_TOKEN", "", str, non_empty=True) + check_slack_telegram(config, service="slack") + if config.get("ENABLE_TELEGRAM_ALERT", False): + check_assert("TELEGRAM_TOKEN", "", str, non_empty=True) + check_assert("TELEGRAM_CHAT_ID", 0, int, allow_zero=False) + check_slack_telegram(config, service="telegram") + if config.get("ENABLE_EMAIL_ALERT", False): + check_assert("EMAIL_PASSWORD", "", str, non_empty=True) + check_email(config) + + # System + check_assert("DEBUG", False, bool) + check_assert("VERBOSE", False, bool) + # if config.get('VERBOSE', False): + # logging.getLogger('apscheduler').setLevel(logging.DEBUG) + # else: + # logging.getLogger('apscheduler').setLevel(logging.WARNING) + check_assert("DATA_PATH", "", str) + check_assert("DATABASE_URL", "", str) + database_url = config.get("DATABASE_URL", "") + if database_url: + assert any(test_database_url_pattern(database_url)), ( + "Invalid format of DATABASE_URL: %s" % database_url + ) + + # Apscheduler + # In __init__.py create_app(): scheduler.start(paused=True) + if handle_metadata().get("scheduler_state", STATE_RUNNING) != STATE_PAUSED: + scheduler.resume() + logger.info("Scheduler for timer tasks: %s", SCHEDULER_STATE_DICT[scheduler.state]) + + check_assert("JOBS_SNAPSHOT_INTERVAL", 300, int) + JOBS_SNAPSHOT_INTERVAL = config.get("JOBS_SNAPSHOT_INTERVAL", 300) + if JOBS_SNAPSHOT_INTERVAL: + # TODO: with app.app_context(): url = url_for('jobs', node=1) + # Working outside of application context. + # only because before app.run?! 
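+        # The "jobs_snapshot" job added below simply POSTs to the Jobs page of every
+        # node ('http(s)://host:port/<node>/jobs/', with basic auth if enabled),
+        # presumably so that the database-backed jobs data keeps being refreshed even
+        # when nobody has the page open in a browser; see create_jobs_snapshot() below.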
+ username = config.get("USERNAME", "") + password = config.get("PASSWORD", "") + kwargs = dict( + # 'http(s)://127.0.0.1:5000' + '/1/jobs/' + url_jobs=config["URL_SCRAPYDWEB"] + + handle_metadata().get("url_jobs", "/1/jobs/"), + auth=(username, password) if username and password else None, + nodes=list(range(1, len(config["SCRAPYD_SERVERS"]) + 1)), + ) + logger.info( + scheduler.add_job( + id="jobs_snapshot", + replace_existing=True, + func=create_jobs_snapshot, + args=None, + kwargs=kwargs, + trigger="interval", + seconds=JOBS_SNAPSHOT_INTERVAL, + misfire_grace_time=60, + coalesce=True, + max_instances=1, + jobstore="memory", + ) + ) + + # Subprocess + init_subprocess(config) + + +def create_jobs_snapshot(url_jobs, auth, nodes): + for node in nodes: + url_jobs = re.sub( + REPLACE_URL_NODE_PATTERN, r"\g<1>%s/" % node, url_jobs, count=1 + ) + try: + r = session.post(url_jobs, auth=auth, timeout=60) + assert r.status_code == 200, "Request got status_code: %s" % r.status_code + except Exception as err: + print("Fail to create jobs snapshot: %s\n%s" % (url_jobs, err)) + # else: + # print(url_jobs, r.status_code) + + +def check_scrapyd_servers(config): + SCRAPYD_SERVERS = config.get("SCRAPYD_SERVERS", []) or ["127.0.0.1:6800"] + SCRAPYD_SERVERS_PUBLIC_URLS = config.get("SCRAPYD_SERVERS_PUBLIC_URLS", None) or [ + "" + ] * len(SCRAPYD_SERVERS) + assert len(SCRAPYD_SERVERS_PUBLIC_URLS) == len(SCRAPYD_SERVERS), ( + "SCRAPYD_SERVERS_PUBLIC_URLS should have same length with SCRAPYD_SERVERS:\n%s\nvs.\n%s" + % (SCRAPYD_SERVERS_PUBLIC_URLS, SCRAPYD_SERVERS) + ) + + servers = [] + for idx, (server, public_url) in enumerate( + zip(SCRAPYD_SERVERS, SCRAPYD_SERVERS_PUBLIC_URLS) + ): + if isinstance(server, tuple): + assert len(server) == 5, ( + "Scrapyd server should be a tuple of 5 elements, " + "current value: %s" % str(server) + ) + usr, psw, ip, port, group = server + else: + usr, psw, ip, port, group = re.search( + SCRAPYD_SERVER_PATTERN, server.strip() + ).groups() + ip = ip.strip() if ip and ip.strip() else "127.0.0.1" + port = port.strip() if port and port.strip() else "6800" + group = group.strip() if group and group.strip() else "" + auth = (usr, psw) if usr and psw else None + public_url = public_url.strip(" /") + servers.append((group, ip, port, auth, public_url)) + + def key_func(arg): + _group, _ip, _port, _auth, _public_url = arg + parts = _ip.split(".") + parts = [("0" * (3 - len(part)) + part) for part in parts] + return [_group, ".".join(parts), int(_port)] + + servers = sorted(set(servers), key=key_func) + check_scrapyd_connectivity(servers) + + config["SCRAPYD_SERVERS"] = [ + "%s:%s" % (ip, port) for (group, ip, port, auth, public_url) in servers + ] + config["SCRAPYD_SERVERS_GROUPS"] = [ + group for (group, ip, port, auth, public_url) in servers + ] + config["SCRAPYD_SERVERS_AUTHS"] = [ + auth for (group, ip, port, auth, public_url) in servers + ] + config["SCRAPYD_SERVERS_PUBLIC_URLS"] = [ + public_url for (group, ip, port, auth, public_url) in servers + ] + + +def check_scrapyd_connectivity(servers): + logger.debug("Checking connectivity of SCRAPYD_SERVERS...") + + def check_connectivity(server): + (_group, _ip, _port, _auth, _public_url) = server + try: + url = "http://%s:%s" % (_ip, _port) + r = session.get(url, auth=_auth, timeout=10) + assert r.status_code == 200, "%s got status_code %s" % (url, r.status_code) + except Exception as err: + logger.error(err) + return False + else: + return True + + # with ThreadPool(min(len(servers), 100)) as pool: # Works in python 3.3 and up + # 
results = pool.map(check_connectivity, servers) + pool = ThreadPool(min(len(servers), 100)) + results = pool.map(check_connectivity, servers) + pool.close() + pool.join() + + print( + "\nIndex {group:<20} {server:<21} Connectivity Auth".format( + group="Group", server="Scrapyd IP:Port" + ) + ) + print(HASH) + for idx, ((group, ip, port, auth, public_url), result) in enumerate( + zip(servers, results), 1 + ): + print( + "{idx:_<5} {group:_<20} {server:_<22} {result:_<11} {auth}".format( + idx=idx, + group=group or "None", + server="%s:%s" % (ip, port), + auth=auth, + result=str(result), + ) + ) + print(HASH + "\n") + + assert any(results), "None of your SCRAPYD_SERVERS could be connected. " + + +def check_slack_telegram(config, service): + logger.debug("Trying to send %s..." % service) + text = "%s alert enabled #scrapydweb" % service.capitalize() + alert = ( + "Fail to send text via %s, you may need to set 'ENABLE_%s_ALERT = False'" + % (service.capitalize(), service.upper()) + ) + if service == "slack": + url = "https://slack.com/api/chat.postMessage" + data = dict( + token=config["SLACK_TOKEN"], channel=config["SLACK_CHANNEL"], text=text + ) + else: + url = "https://api.telegram.org/bot%s/sendMessage" % config["TELEGRAM_TOKEN"] + data = dict(chat_id=config["TELEGRAM_CHAT_ID"], text=text) + r = None + try: + r = session.post(url, data=data, timeout=30) + js = r.json() + assert r.status_code == 200 and js["ok"] is True + except Exception as err: + logger.error(err) + logger.error("url: %s", url) + logger.error("data: %s", data) + if r is not None: + logger.error("status_code: %s", r.status_code) + logger.error("response: %s", r.text) + assert False, alert + else: + logger.info(text) + + +def check_email(config): + kwargs = dict( + email_username=config["EMAIL_USERNAME"], + email_password=config["EMAIL_PASSWORD"], + email_sender=config["EMAIL_SENDER"], + email_recipients=config["EMAIL_RECIPIENTS"], + smtp_server=config["SMTP_SERVER"], + smtp_port=config["SMTP_PORT"], + smtp_over_ssl=config.get("SMTP_OVER_SSL", False), + smtp_connection_timeout=config.get("SMTP_CONNECTION_TIMEOUT", 30), + ) + kwargs["to_retry"] = True + kwargs["subject"] = "Email alert enabled #scrapydweb" + kwargs["content"] = json_dumps( + dict( + EMAIL_SENDER=config["EMAIL_SENDER"], + EMAIL_RECIPIENTS=config["EMAIL_RECIPIENTS"], + ) + ) + + logger.debug( + "Trying to send email (smtp_connection_timeout=%s)...", + config.get("SMTP_CONNECTION_TIMEOUT", 30), + ) + result, reason = send_email(**kwargs) + if not result and os.environ.get("TEST_ON_CIRCLECI", "False") == "False": + logger.debug( + "kwargs for send_email():\n%s", json_dumps(kwargs, sort_keys=False) + ) + assert ( + result + ), "Fail to send email. 
Modify the email settings above or set 'ENABLE_EMAIL_ALERT = False'" + logger.info(kwargs["subject"]) + + +def init_subprocess(config): + if config.get("ENABLE_LOGPARSER", False): + config["LOGPARSER_PID"] = init_logparser(config) + else: + config["LOGPARSER_PID"] = None + handle_metadata("logparser_pid", config["LOGPARSER_PID"]) + + if config.get("ENABLE_MONITOR", False): + config["POLL_PID"] = init_poll(config) + else: + config["POLL_PID"] = None + handle_metadata("poll_pid", config["POLL_PID"]) + diff --git a/utils/poll.py b/utils/poll.py new file mode 100644 index 00000000..f864271c --- /dev/null +++ b/utils/poll.py @@ -0,0 +1,249 @@ +# coding: utf-8 +import json +import logging +import os +import platform +import re +import sys +import time +import traceback + +try: + from psutil import pid_exists +except ImportError: + pid_exists = None + +import requests +from requests.adapters import HTTPAdapter + + +logger = logging.getLogger('scrapydweb.utils.poll') # __name__ +_handler = logging.StreamHandler() +_formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)-8s in %(name)s: %(message)s") +_handler.setFormatter(_formatter) +logger.addHandler(_handler) + +IN_WINDOWS = platform.system() == 'Windows' +JOB_PATTERN = re.compile(r""" + + (?P.*?) + (?P.*?) + (?P.*?) + (?:(?P.*?))? + (?:(?P.*?))? + (?:(?P.*?))? + (?:(?P.*?))? + (?:(?P.*?))? + (?:(?P.*?))? + [\w\W]*? # Temp support for Scrapyd v1.3.0 (not released) + + """, re.X) +JOB_KEYS = ['project', 'spider', 'job', 'pid', 'start', 'runtime', 'finish', 'log', 'items'] + + +class Poll(object): + logger = logger + + def __init__(self, url_scrapydweb, username, password, + scrapyd_servers, scrapyd_servers_auths, + poll_round_interval, poll_request_interval, + main_pid, verbose, exit_timeout=0): + self.url_scrapydweb = url_scrapydweb + self.auth = (username, password) if username and password else None + + self.scrapyd_servers = scrapyd_servers + self.scrapyd_servers_auths = scrapyd_servers_auths + + self.session = requests.Session() + self.session.mount('http://', HTTPAdapter(pool_connections=1000, pool_maxsize=1000)) + self.session.mount('https://', HTTPAdapter(pool_connections=1000, pool_maxsize=1000)) + # if username and password: + # self.session.auth = (username, password) + self.timeout = 60 + + self.poll_round_interval = poll_round_interval + self.poll_request_interval = poll_request_interval + + self.ignore_finished_bool_list = [True] * len(self.scrapyd_servers) + self.finished_jobs_dict = {} + + self.main_pid = main_pid + self.poll_pid = os.getpid() + + if verbose: + self.logger.setLevel(logging.DEBUG) + else: + self.logger.setLevel(logging.INFO) + self.exit_timeout = exit_timeout + + self.init_time = time.time() + self.url_stats = self.url_scrapydweb + '/{node}/log/{opt}/{project}/{spider}/{job}/?job_finished={job_finished}' + + def check_exit(self): + exit_condition_1 = pid_exists is not None and not pid_exists(self.main_pid) + exit_condition_2 = not IN_WINDOWS and not self.check_pid(self.main_pid) + if exit_condition_1 or exit_condition_2: + sys.exit("!!! Poll subprocess (pid: %s) exits " + "since main_pid %s not exists" % (self.poll_pid, self.main_pid)) + + # https://stackoverflow.com/questions/568271/how-to-check-if-there-exists-a-process-with-a-given-pid-in-python + @staticmethod + def check_pid(pid): + """ Check For the existence of a unix pid. 
""" + try: + os.kill(pid, 0) + except OSError: + return False + else: + return True + + def fetch_jobs(self, node, url, auth): + running_jobs = [] + finished_jobs_set = set() + self.logger.debug("[node %s] fetch_jobs: %s", node, url) + r = self.make_request(url, auth=auth, post=False) + # Should not invoke update_finished_jobs() if fail to fetch jobs + assert r is not None, "[node %s] fetch_jobs failed: %s" % (node, url) + + self.logger.debug("[node %s] fetch_jobs got (%s) %s bytes", node, r.status_code, len(r.content)) + # Temp support for Scrapyd v1.3.0 (not released) + text = re.sub(r'.*?', '', r.text, flags=re.S) + jobs = [dict(zip(JOB_KEYS, job)) for job in re.findall(JOB_PATTERN, text)] + for job in jobs: + job_tuple = (job['project'], job['spider'], job['job']) + if job['pid']: + running_jobs.append(job_tuple) + elif job['finish']: + finished_jobs_set.add(job_tuple) + self.logger.debug("[node %s] got running_jobs: %s", node, len(running_jobs)) + self.logger.debug("[node %s] got finished_jobs_set: %s", node, len(finished_jobs_set)) + return running_jobs, finished_jobs_set + + def fetch_stats(self, node, job_tuple, finished_jobs): + (project, spider, job) = job_tuple + job_finished = 'True' if job_tuple in finished_jobs else '' + kwargs = dict( + node=node, + opt='stats', + project=project, + spider=spider, + job=job, + job_finished=job_finished + ) + # http://127.0.0.1:5000/log/stats/proxy/test/55f1f388a7ae11e8b9b114dda9e91c2f/ + url = self.url_stats.format(**kwargs) + self.logger.debug("[node %s] fetch_stats: %s", node, url) + # Make POST request to trigger alert, see log.py + r = self.make_request(url, auth=self.auth, post=True) + if r is None: + self.logger.error("[node %s %s] fetch_stats failed: %s", node, self.scrapyd_servers[node-1], url) + if job_finished: + self.finished_jobs_dict[node].remove(job_tuple) + self.logger.info("[node %s] retry in next round: %s", node, url) + else: + self.logger.debug("[node %s] fetch_stats got (%s) %s bytes from %s", + node, r.status_code, len(r.content), url) + + def main(self): + while True: + self.check_exit() + start_time = time.time() + try: + self.run() + end_time = time.time() + self.logger.debug("Took %.1f seconds", (end_time - start_time)) + if 0 < self.exit_timeout < end_time - self.init_time: + self.logger.critical("GoodBye, exit_timeout: %s", self.exit_timeout) + break + else: + self.logger.info("Sleeping for %ss", self.poll_round_interval) + time.sleep(self.poll_round_interval) + except KeyboardInterrupt: + self.logger.warning("Poll subprocess (pid: %s) cancelled by KeyboardInterrupt", self.poll_pid) + sys.exit() + except Exception: + self.logger.error(traceback.format_exc()) + + def make_request(self, url, auth, post=False): + try: + if post: + r = self.session.post(url, auth=auth, timeout=self.timeout) + else: + r = self.session.get(url, auth=auth, timeout=self.timeout) + r.encoding = 'utf-8' + assert r.status_code == 200, "got status_code %s" % r.status_code + except Exception as err: + self.logger.error("make_request failed: %s\n%s", url, err) + return None + else: + return r + + def run(self): + for node, (scrapyd_server, auth) in enumerate(zip(self.scrapyd_servers, self.scrapyd_servers_auths), 1): + # Update Jobs history + # url_jobs = self.url_scrapydweb + '/%s/jobs/' % node + # self.make_request(url_jobs, auth=self.auth, post=True) + + url_jobs = 'http://%s/jobs' % scrapyd_server + # json.loads(json.dumps({'auth':(1,2)})) => {'auth': [1, 2]} + auth = tuple(auth) if auth else None # TypeError: 'list' object is not callable + 
try: + running_jobs, finished_jobs_set = self.fetch_jobs(node, url_jobs, auth) + finished_jobs = self.update_finished_jobs(node, finished_jobs_set) + for job_tuple in running_jobs + finished_jobs: + self.fetch_stats(node, job_tuple, finished_jobs) + self.logger.debug("Sleeping for %ss", self.poll_request_interval) + time.sleep(self.poll_request_interval) + except KeyboardInterrupt: + raise + except AssertionError as err: + self.logger.error(err) + except Exception: + self.logger.error(traceback.format_exc()) + + def update_finished_jobs(self, node, finished_jobs_set): + finished_jobs_set_previous = self.finished_jobs_dict.setdefault(node, set()) + self.logger.debug("[node %s] previous finished_jobs_set: %s", node, len(finished_jobs_set_previous)) + # set([2,3]).difference(set([1,2])) => {3} + finished_jobs_set_new_added = finished_jobs_set.difference(finished_jobs_set_previous) + self.finished_jobs_dict[node] = finished_jobs_set + self.logger.debug("[node %s] now finished_jobs_set: %s", node, len(self.finished_jobs_dict[node])) + if finished_jobs_set_new_added: + self.logger.info("[node %s] new added finished_jobs_set: %s", node, finished_jobs_set_new_added) + else: + self.logger.debug("[node %s] new added finished_jobs_set: %s", node, finished_jobs_set_new_added) + + finished_jobs = [] + ignore = self.ignore_finished_bool_list[node-1] + for job_tuple in finished_jobs_set_new_added: + if ignore: + self.logger.debug("[node %s] ignore finished job: %s", node, job_tuple) + else: + finished_jobs.append(job_tuple) + if ignore: + self.ignore_finished_bool_list[node-1] = False + self.logger.debug("[node %s] new added finished_jobs after filter: %s", node, len(finished_jobs)) + return finished_jobs + + +def main(args): + keys = ('url_scrapydweb', 'username', 'password', + 'scrapyd_servers', 'scrapyd_servers_auths', + 'poll_round_interval', 'poll_request_interval', + 'main_pid', 'verbose', 'exit_timeout') + kwargs = dict(zip(keys, args)) + kwargs['scrapyd_servers'] = json.loads(kwargs['scrapyd_servers']) + kwargs['scrapyd_servers_auths'] = json.loads(kwargs['scrapyd_servers_auths']) + kwargs['poll_round_interval'] = int(kwargs['poll_round_interval']) + kwargs['poll_request_interval'] = int(kwargs['poll_request_interval']) + kwargs['main_pid'] = int(kwargs['main_pid']) + kwargs['verbose'] = kwargs['verbose'] == 'True' + kwargs['exit_timeout'] = int(kwargs.setdefault('exit_timeout', 0)) # For test only + + poll = Poll(**kwargs) + poll.main() + return poll.ignore_finished_bool_list # For test only + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/utils/scheduler.py b/utils/scheduler.py new file mode 100644 index 00000000..5ca94fed --- /dev/null +++ b/utils/scheduler.py @@ -0,0 +1,111 @@ +# coding: utf-8 +import atexit +import logging +from pprint import pformat + +from apscheduler.events import EVENT_JOB_MAX_INSTANCES, EVENT_JOB_REMOVED +from apscheduler.executors.pool import ThreadPoolExecutor # , ProcessPoolExecutor +from apscheduler.jobstores.sqlalchemy import SQLAlchemyJobStore +from apscheduler.jobstores.memory import MemoryJobStore +from apscheduler.schedulers.background import BackgroundScheduler + +from ..common import handle_metadata +from ..vars import APSCHEDULER_DATABASE_URI, TIMER_TASKS_HISTORY_LOG + + +apscheduler_logger = logging.getLogger('apscheduler') +# _handler = logging.StreamHandler() +# logging.FileHandler(filename, mode='a', encoding=None, delay=False) +_handler = logging.FileHandler(TIMER_TASKS_HISTORY_LOG, mode='a', encoding='utf-8') 
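+# Added note: with the WARNING level set just below, only records such as missed runs or skipped
+# executions from the 'apscheduler' loggers reach TIMER_TASKS_HISTORY_LOG, e.g. (illustrative):
+# [2019-01-01 10:00:00,000] WARNING in apscheduler.executors.default: Run time of job "task_1" was missed by 0:00:26.030600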
+_handler.setLevel(logging.WARNING) +_formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)s in %(name)s: %(message)s") +_handler.setFormatter(_formatter) +apscheduler_logger.addHandler(_handler) + + +# EVENT_JOB_REMOVED = 2 ** 10 +# {'alias': None, 'code': 1024, 'job_id': '1', 'jobstore': 'default'} +# EVENT_JOB_MAX_INSTANCES = 2 ** 16 +EVENT_MAP = {EVENT_JOB_MAX_INSTANCES: 'EVENT_JOB_MAX_INSTANCES', EVENT_JOB_REMOVED: 'EVENT_JOB_REMOVED'} + +jobstores = { + 'default': SQLAlchemyJobStore(url=APSCHEDULER_DATABASE_URI), + 'memory': MemoryJobStore() +} +executors = { + 'default': ThreadPoolExecutor(20), + # 'processpool': ProcessPoolExecutor(5) +} +job_defaults = { + 'coalesce': True, + 'max_instances': 1 +} +# https://apscheduler.readthedocs.io/en/latest/userguide.html +# scheduler = BackgroundScheduler(jobstores=jobstores, executors=executors, job_defaults=job_defaults, timezone=utc) +scheduler = BackgroundScheduler(jobstores=jobstores, executors=executors, job_defaults=job_defaults) + + +# https://apscheduler.readthedocs.io/en/latest/userguide.html#scheduler-events +# EVENT_JOB_EXECUTED: 'code': 4096, 'exception': None +# EVENT_JOB_ERROR: 'code': 8192, 'exception': xxx +# apscheduler/executors/base.py + # events.append(JobExecutionEvent(EVENT_JOB_MISSED, job.id, jobstore_alias, + # run_time)) + # logger.warning('Run time of job "%s" was missed by %s', job, difference) +# WARNING in apscheduler.executors.default: Run time of job "task_1" was missed by 0:00:26.030600 +# apscheduler/schedulers/base.py +# self._logger = maybe_ref(config.pop('logger', None)) or getLogger('apscheduler.scheduler') +# self._logger.warning( +# 'Execution of job "%s" skipped: maximum number of running ' +# 'instances reached (%d)', job, job.max_instances) +# event = JobSubmissionEvent(EVENT_JOB_MAX_INSTANCES, job.id, +# jobstore_alias, run_times) +# events.append(event) + +# EVENT_JOB_MAX_INSTANCES: 'job_id': 'jobs_snapshot', 'jobstore': 'memory', +# WARNING in apscheduler.scheduler: Execution of job "create_jobs_snapshot (trigger: interval[0:00:10], +# next run at: " skipped: maximum number of running instances reached (1) +def my_listener(event): + msg = "%s: \n%s\n" % (EVENT_MAP[event.code], pformat(vars(event), indent=4)) + # logger defined outside the callback of add_listener does not take effect?! + # In case JOBS_SNAPSHOT_INTERVAL is set too short, like 10 seconds + if event.jobstore != 'default': + logging.getLogger('apscheduler').info(msg) + else: + logging.getLogger('apscheduler').warning(msg) + + # if hasattr(event, 'exception') and event.exception: + # print(event.exception) + # if hasattr(event, 'traceback') and event.traceback: + # print(event.traceback) + + +# To trigger EVENT_JOB_MAX_INSTANCES: +# add sleep in execute_task() +# second: */10, max_instances: 2 +# EVENT_JOB_ERROR and EVENT_JOB_MISSED are caught by logging.FileHandler +scheduler.add_listener(my_listener, EVENT_JOB_MAX_INSTANCES | EVENT_JOB_REMOVED) + +# if scheduler.state == STATE_STOPPED: +scheduler.start(paused=True) + + +def shutdown_scheduler(): + apscheduler_logger.debug("Scheduled tasks: %s", scheduler.get_jobs()) + apscheduler_logger.warning("Shutting down the scheduler for timer tasks gracefully, " + "wait until all currently executing tasks are finished") + apscheduler_logger.warning("The main pid is %s. Kill it manually if you don't want to wait", + handle_metadata().get('main_pid')) + scheduler.shutdown() + # apscheduler_logger.info("Waits until all currently executing jobs are finished. 
" + # "Press CTRL+C to force unclean shutdown") + # try: + # scheduler.shutdown() + # except KeyboardInterrupt: + # apscheduler_logger.warning("Forcing unclean shutdown") + # scheduler.shutdown(wait=False) + # apscheduler_logger.info("Good Bye") + + +# https://stackoverflow.com/questions/21214270/scheduling-a-function-to-run-every-hour-on-flask +atexit.register(lambda: shutdown_scheduler()) diff --git a/utils/send_email.py b/utils/send_email.py new file mode 100644 index 00000000..982acc8c --- /dev/null +++ b/utils/send_email.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from collections import OrderedDict +from email.mime.text import MIMEText +import json +import logging +import smtplib +import sys +import time + + +logger = logging.getLogger('scrapydweb.utils.send_email') # __name__ +logger.setLevel(logging.DEBUG) + + +# https://stackoverflow.com/a/27515833/10517783 How to send an email with Gmail as provider using Python? +# https://stackoverflow.com/a/26053352/10517783 Python smtplib proxy support +def send_email(**kwargs): + to_retry = kwargs.get('to_retry', False) + need_debug = kwargs.get('need_debug', False) + + email_username = kwargs['email_username'] + email_password = kwargs['email_password'] + email_sender = kwargs['email_sender'] + email_recipients = kwargs['email_recipients'] + smtp_server = kwargs['smtp_server'] + smtp_port = kwargs['smtp_port'] + smtp_over_ssl = kwargs['smtp_over_ssl'] + smtp_connection_timeout = kwargs['smtp_connection_timeout'] + subject = kwargs['subject'] + content = kwargs['content'] + # https://stackoverflow.com/questions/6921699/can-i-get-json-to-load-into-an-ordereddict/6921760#6921760 + # data = json.loads('{"foo":1, "bar": 2}', object_pairs_hook=OrderedDict) + # In log.py : ensure_ascii=True + # json.loads('abc') -> JSONDecodeError + try: + content = json.dumps(json.loads(content, object_pairs_hook=OrderedDict), + sort_keys=False, indent=4, ensure_ascii=False) + except ValueError: + pass + + msg = MIMEText(u'%s\n%s' % (time.ctime(), content), 'plain', 'utf-8') + msg['From'] = email_sender + msg['Subject'] = u'{} {}'.format(time.strftime('%H:%M'), subject) + + server = None + result = False + reason = '' + try: + if smtp_over_ssl: + server = smtplib.SMTP_SSL(smtp_server, smtp_port, timeout=smtp_connection_timeout) + else: + server = smtplib.SMTP(smtp_server, smtp_port, timeout=smtp_connection_timeout) + server.ehlo() + server.starttls() + if need_debug: + server.set_debuglevel(1) # For debug + server.login(email_username, email_password) + server.sendmail(email_sender, email_recipients, msg.as_string()) + except Exception as err: + logger.error("Fail to send email: %s", subject) + try: + reason = err.args[-1].decode('utf8') + except: + try: + reason = err.args[-1].decode('gbk') + except: + reason = str(err) + logger.info("Fail reason: %s", reason) + if to_retry: + kwargs.update(to_retry=False, need_debug=True) + logger.debug("Retrying...") + time.sleep(3) + return send_email(**kwargs) + else: + result = True + reason = "Sent" + logger.info("Email sent: %s", subject) + finally: + if server is not None: + try: + server.quit() + except: + pass + + return result, reason + + +if __name__ == '__main__': + # To avoid logging twice when importing the send_email function to send email. 
+ _handler = logging.StreamHandler() + _formatter = logging.Formatter(fmt="[%(asctime)s] %(levelname)-8s in %(name)s: %(message)s") + _handler.setFormatter(_formatter) + logger.addHandler(_handler) + + send_email(**json.loads(sys.argv[1])) diff --git a/utils/setup_database.py b/utils/setup_database.py new file mode 100644 index 00000000..0d2523e0 --- /dev/null +++ b/utils/setup_database.py @@ -0,0 +1,168 @@ +# coding: utf-8 +import os +import re +import sys + + +DB_APSCHEDULER = 'scrapydweb_apscheduler' +DB_TIMERTASKS = 'scrapydweb_timertasks' +DB_METADATA = 'scrapydweb_metadata' +DB_JOBS = 'scrapydweb_jobs' +DBS = [DB_APSCHEDULER, DB_TIMERTASKS, DB_METADATA, DB_JOBS] + +PATTERN_MYSQL = re.compile(r'mysql://(.+?)(?::(.+?))?@(.+?):(\d+)') +PATTERN_POSTGRESQL = re.compile(r'postgres://(.+?)(?::(.+?))?@(.+?):(\d+)') +PATTERN_SQLITE = re.compile(r'sqlite:///(.+)$') + +SCRAPYDWEB_TESTMODE = os.environ.get('SCRAPYDWEB_TESTMODE', 'False').lower() == 'true' + + +def test_database_url_pattern(database_url): + m_mysql = PATTERN_MYSQL.match(database_url) + m_postgres = PATTERN_POSTGRESQL.match(database_url) + m_sqlite = PATTERN_SQLITE.match(database_url) + return m_mysql, m_postgres, m_sqlite + + +def setup_database(database_url, database_path): + database_url = re.sub(r'\\', '/', database_url) + database_url = re.sub(r'/$', '', database_url) + database_path = re.sub(r'\\', '/', database_path) + database_path = re.sub(r'/$', '', database_path) + + m_mysql, m_postgres, m_sqlite = test_database_url_pattern(database_url) + if m_mysql: + setup_mysql(*m_mysql.groups()) + elif m_postgres: + setup_postgresql(*m_postgres.groups()) + else: + database_path = m_sqlite.group(1) if m_sqlite else database_path + database_path = os.path.abspath(database_path) + database_path = re.sub(r'\\', '/', database_path) + database_path = re.sub(r'/$', '', database_path) + if not os.path.isdir(database_path): + os.mkdir(database_path) + + if m_mysql or m_postgres: + APSCHEDULER_DATABASE_URI = '/'.join([database_url, DB_APSCHEDULER]) + SQLALCHEMY_DATABASE_URI = '/'.join([database_url, DB_TIMERTASKS]) + SQLALCHEMY_BINDS = { + 'metadata': '/'.join([database_url, DB_METADATA]), + 'jobs': '/'.join([database_url, DB_JOBS]) + } + else: + # db names for backward compatibility + APSCHEDULER_DATABASE_URI = 'sqlite:///' + '/'.join([database_path, 'apscheduler.db']) + # http://flask-sqlalchemy.pocoo.org/2.3/binds/#binds + SQLALCHEMY_DATABASE_URI = 'sqlite:///' + '/'.join([database_path, 'timer_tasks.db']) + SQLALCHEMY_BINDS = { + 'metadata': 'sqlite:///' + '/'.join([database_path, 'metadata.db']), + 'jobs': 'sqlite:///' + '/'.join([database_path, 'jobs.db']) + } + + if SCRAPYDWEB_TESTMODE: + print("DATABASE_PATH: %s" % database_path) + print("APSCHEDULER_DATABASE_URI: %s" % APSCHEDULER_DATABASE_URI) + print("SQLALCHEMY_DATABASE_URI: %s" % SQLALCHEMY_DATABASE_URI) + print("SQLALCHEMY_BINDS: %s" % SQLALCHEMY_BINDS) + return APSCHEDULER_DATABASE_URI, SQLALCHEMY_DATABASE_URI, SQLALCHEMY_BINDS, database_path + + +def drop_database(cur, dbname): + sql = "DROP DATABASE %s" % dbname + print(sql) + try: + cur.execute(sql) + except Exception as err: + print(err) + + +def setup_mysql(username, password, host, port): + """ + ModuleNotFoundError: No module named 'MySQLdb' + pip install mysqlclient + Python 2: pip install mysqlclient -> MySQLdb/_mysql.c(29) : + fatal error C1083: Cannot open include file: 'mysql.h': No such file or directory + 
https://stackoverflow.com/questions/51294268/pip-install-mysqlclient-returns-fatal-error-c1083-cannot-open-file-mysql-h + https://www.lfd.uci.edu/~gohlke/pythonlibs/#mysqlclient + pip install "path to the downloaded mysqlclient.whl file" + """ + require_version = '0.9.3' # Dec 18, 2018 + install_command = "pip install --upgrade pymysql" + try: + import pymysql + assert pymysql.__version__ >= require_version, install_command + except (ImportError, AssertionError): + sys.exit("Run command: %s" % install_command) + else: + # Run scrapydweb: ModuleNotFoundError: No module named 'MySQLdb' + pymysql.install_as_MySQLdb() + + conn = pymysql.connect(host=host, port=int(port), user=username, password=password, + charset='utf8', cursorclass=pymysql.cursors.DictCursor) + cur = conn.cursor() + for dbname in DBS: + if SCRAPYDWEB_TESTMODE: + drop_database(cur, dbname) + # pymysql.err.ProgrammingError: (1007, "Can't create database 'scrapydweb_apscheduler'; database exists") + # cur.execute("CREATE DATABASE IF NOT EXISTS %s CHARACTER SET 'utf8' COLLATE 'utf8_general_ci'" % dbname) + try: + cur.execute("CREATE DATABASE %s CHARACTER SET 'utf8' COLLATE 'utf8_general_ci'" % dbname) + except Exception as err: + if 'exists' in str(err): + pass + else: + raise + cur.close() + conn.close() + + +def setup_postgresql(username, password, host, port): + """ + https://github.com/my8100/notes/blob/master/back_end/the-flask-mega-tutorial.md + When working with database servers such as MySQL and PostgreSQL, + you have to create the database in the database server before running upgrade. + """ + require_version = '2.7.7' # Jan 23, 2019 + install_command = "pip install --upgrade psycopg2" + try: + import psycopg2 + assert psycopg2.__version__ >= require_version, install_command + except (ImportError, AssertionError): + sys.exit("Run command: %s" % install_command) + + conn = psycopg2.connect(host=host, port=int(port), user=username, password=password) + conn.set_isolation_level(0) # https://wiki.postgresql.org/wiki/Psycopg2_Tutorial + cur = conn.cursor() + for dbname in DBS: + if SCRAPYDWEB_TESTMODE: + # database "scrapydweb_apscheduler" is being accessed by other users + # DETAIL: There is 1 other session using the database. + # To restart postgres server on Windows -> win+R: services.msc + drop_database(cur, dbname) + + # https://www.postgresql.org/docs/9.0/sql-createdatabase.html + # https://stackoverflow.com/questions/9961795/ + # utf8-postgresql-create-database-like-mysql-including-character-set-encoding-a + + # psycopg2.ProgrammingError: invalid locale name: "en_US.UTF-8" + # https://stackoverflow.com/questions/40673339/ + # creating-utf-8-database-in-postgresql-on-windows10 + + # cur.execute("CREATE DATABASE %s ENCODING 'UTF8' LC_COLLATE 'en-US' LC_CTYPE 'en-US'" % dbname) + # psycopg2.DataError: new collation (en-US) is incompatible with the collation of the template database + # (Chinese (Simplified)_People's Republic of China.936) + # HINT: Use the same collation as in the template database, or use template0 as template. 
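+ # Added sketch, based on the HINT above rather than on what the try block below does:
+ # template0 could be used to sidestep the collation mismatch, e.g.
+ #     cur.execute("CREATE DATABASE %s TEMPLATE template0 ENCODING 'UTF8' "
+ #                 "LC_COLLATE 'en_US.UTF-8' LC_CTYPE 'en_US.UTF-8'" % dbname)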
+ try: + cur.execute("CREATE DATABASE %s ENCODING 'UTF8' LC_COLLATE 'en_US.UTF-8' LC_CTYPE 'en_US.UTF-8'" % dbname) + except: + try: + cur.execute("CREATE DATABASE %s" % dbname) + except Exception as err: + # psycopg2.ProgrammingError: database "scrapydweb_apscheduler" already exists + if 'exists' in str(err): + pass + else: + raise + cur.close() + conn.close() diff --git a/utils/sub_process.py b/utils/sub_process.py new file mode 100644 index 00000000..4e1c7232 --- /dev/null +++ b/utils/sub_process.py @@ -0,0 +1,125 @@ +# coding: utf-8 +import atexit +from ctypes import cdll +import logging +import os +import platform +import signal +from subprocess import Popen +import sys + +from ..common import json_dumps + + +logger = logging.getLogger(__name__) + +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) + + +# https://stackoverflow.com/a/13256908/10517783 +# https://stackoverflow.com/a/23587108/10517783 +# http://evans.io/legacy/posts/killing-child-processes-on-parent-exit-prctl/ +class PrCtlError(Exception): + pass + + +def on_parent_exit(signame): + """ + Return a function to be run in a child process which will trigger + SIGNAME to be sent when the parent process dies + """ + # On Windows, signal() can only be called with SIGABRT, SIGFPE, SIGILL, SIGINT, SIGSEGV, or SIGTERM. + signum = getattr(signal, signame) # SIGTERM 15 SIGKILL 9 + + def set_parent_exit_signal(): + # Constant taken from http://linux.die.net/include/linux/prctl.h + PR_SET_PDEATHSIG = 1 + # http://linux.die.net/man/2/prctl + result = cdll['libc.so.6'].prctl(PR_SET_PDEATHSIG, signum) + if result != 0: + raise PrCtlError('prctl failed with error code %s' % result) + + return set_parent_exit_signal + + +# https://stackoverflow.com/a/19448255/10517783 +def kill_child(proc, title=''): + proc.kill() + # A None value indicates that the process has not terminated yet. + # A negative value -N indicates that the child was terminated by signal N (Unix only). 
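+ # Illustrative example of the warning below on Linux, where proc.kill() sends SIGKILL (signal 9):
+ #     LogParser subprocess (pid: 1234) killed with returncode: -9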
+ logger.warning('%s subprocess (pid: %s) killed with returncode: %s', title, proc.pid, proc.wait()) + + +def init_logparser(config): + logparser_subprocess = start_logparser(config) + logparser_pid = logparser_subprocess.pid + logger.info("Running LogParser in the background with pid: %s", logparser_pid) + atexit.register(kill_child, logparser_subprocess, 'LogParser') + return logparser_pid + + +def start_logparser(config): + args = [ + sys.executable, + '-m', + 'logparser.run', + '-dir', + config['LOCAL_SCRAPYD_LOGS_DIR'], + '--main_pid', + str(config['MAIN_PID']), + ] + + if platform.system() == 'Linux': + kwargs = dict(preexec_fn=on_parent_exit('SIGKILL')) # 'SIGTERM' 'SIGKILL' + try: + logparser_subprocess = Popen(args, **kwargs) + except Exception as err: + logger.error(err) + logparser_subprocess = Popen(args) + else: + logparser_subprocess = Popen(args) + + return logparser_subprocess + + +def init_poll(config): + poll_subprocess = start_poll(config) + poll_pid = poll_subprocess.pid + logger.info("Start polling job stats for monitor & alert in the background with pid: %s", poll_pid) + atexit.register(kill_child, poll_subprocess, 'Poll') + return poll_pid + + +def start_poll(config): + args = [ + sys.executable, + os.path.join(CURRENT_DIR, 'poll.py'), + + config['URL_SCRAPYDWEB'], + config.get('USERNAME', '') if config.get('ENABLE_AUTH', False) else '', + config.get('PASSWORD', '') if config.get('ENABLE_AUTH', False) else '', + json_dumps(config.get('SCRAPYD_SERVERS', ['127.0.0.1'])), + json_dumps(config.get('SCRAPYD_SERVERS_AUTHS', [None])), + str(config.get('POLL_ROUND_INTERVAL', 300)), + str(config.get('POLL_REQUEST_INTERVAL', 10)), + str(config['MAIN_PID']), + str(config.get('VERBOSE', False)) + ] + + # 'Windows': + # AttributeError: module 'signal' has no attribute 'SIGKILL' + # ValueError: preexec_fn is not supported on Windows platforms + # macOS('Darwin'): + # subprocess.SubprocessError: Exception occurred in preexec_fn. 
+ # OSError: dlopen(libc.so.6, 6): image not found + if platform.system() == 'Linux': + kwargs = dict(preexec_fn=on_parent_exit('SIGKILL')) # 'SIGTERM' 'SIGKILL' + try: + poll_subprocess = Popen(args, **kwargs) + except Exception as err: + logger.error(err) + poll_subprocess = Popen(args) + else: + poll_subprocess = Popen(args) + + return poll_subprocess diff --git a/vars.py b/vars.py new file mode 100644 index 00000000..e2850cd4 --- /dev/null +++ b/vars.py @@ -0,0 +1,160 @@ +# coding: utf-8 +import glob +import importlib +import io +import os +import re +import sys + +from apscheduler.schedulers.base import STATE_PAUSED, STATE_RUNNING, STATE_STOPPED + +from .default_settings import DATA_PATH as default_data_path +from .default_settings import DATABASE_URL as default_database_url +from .utils.setup_database import setup_database + + +PYTHON_VERSION = ".".join([str(n) for n in sys.version_info[:3]]) +PY2 = sys.version_info.major < 3 +SCRAPYDWEB_SETTINGS_PY = "scrapydweb_settings_v10.py" +sys.path.append(os.getcwd()) +try: + custom_settings_module = importlib.import_module( + os.path.splitext(SCRAPYDWEB_SETTINGS_PY)[0] + ) +except ImportError: + custom_data_path = "" + custom_database_url = "" +else: + custom_data_path = getattr(custom_settings_module, "DATA_PATH", "") + custom_data_path = custom_data_path if isinstance(custom_data_path, str) else "" + custom_database_url = getattr(custom_settings_module, "DATABASE_URL", "") + custom_database_url = ( + custom_database_url if isinstance(custom_database_url, str) else "" + ) + custom_max_running_hours = getattr(custom_settings_module, "MAX_HOURS", "") +# For data storage +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + +DATA_PATH = default_data_path or custom_data_path +if DATA_PATH: + DATA_PATH = os.path.abspath(DATA_PATH) +else: + DATA_PATH = os.path.join(ROOT_DIR, "data") + +DATABASE_PATH = os.path.join(DATA_PATH, "database") +DEMO_PROJECTS_PATH = os.path.join(DATA_PATH, "demo_projects") +DEPLOY_PATH = os.path.join(DATA_PATH, "deploy") +HISTORY_LOG = os.path.join(DATA_PATH, "history_log") +PARSE_PATH = os.path.join(DATA_PATH, "parse") +SCHEDULE_PATH = os.path.join(DATA_PATH, "schedule") +STATS_PATH = os.path.join(DATA_PATH, "stats") + +for path in [ + DATA_PATH, + DATABASE_PATH, + DEMO_PROJECTS_PATH, + DEPLOY_PATH, + HISTORY_LOG, + PARSE_PATH, + SCHEDULE_PATH, + STATS_PATH, +]: + if not os.path.isdir(path): + os.mkdir(path) + elif path in [PARSE_PATH, DEPLOY_PATH, SCHEDULE_PATH]: + for file in glob.glob(os.path.join(path, "*.*")): + if not os.path.split(file)[-1] in ["ScrapydWeb_demo.log"]: + os.remove(file) + +RUN_SPIDER_HISTORY_LOG = os.path.join(HISTORY_LOG, "run_spider_history.log") +TIMER_TASKS_HISTORY_LOG = os.path.join(HISTORY_LOG, "timer_tasks_history.log") + +# For database +DATABASE_URL = ( + custom_database_url or default_database_url or "sqlite:///" + DATABASE_PATH +) +results = setup_database(DATABASE_URL, DATABASE_PATH) +( + APSCHEDULER_DATABASE_URI, + SQLALCHEMY_DATABASE_URI, + SQLALCHEMY_BINDS, + DATABASE_PATH, +) = results + +# For check_app_config() and BaseView +ALLOWED_SCRAPYD_LOG_EXTENSIONS = [".log", ".log.gz", ".txt", ".gz", ""] +ALERT_TRIGGER_KEYS = ["CRITICAL", "ERROR", "WARNING", "REDIRECT", "RETRY", "IGNORE"] + +# Error: Project names must begin with a letter and contain only letters, numbers and underscores +STRICT_NAME_PATTERN = re.compile(r"[^0-9A-Za-z_]") +LEGAL_NAME_PATTERN = re.compile(r"[^0-9A-Za-z_-]") + +# For ScheduleView +SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d 
setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1" +UA_DICT = { + "custom": "Mozilla/5.0", + "Chrome": ( + "Mozilla/5.0 (Windows NT 10.0; WOW64) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36" + ), + "iPhone": ( + "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) " + "AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1" + ), + "iPad": ( + "Mozilla/5.0 (iPad; CPU OS 12_1_4 like Mac OS X) " + "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1" + ), + "Android": ( + "Mozilla/5.0 (Linux; Android 8.0.0; Pixel 2 XL Build/OPD1.170816.004) " + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Mobile Safari/537.36" + ), +} + +# For LogsView and ItemsView +DIRECTORY_PATTERN = re.compile( + r""" + odd|even)">\n + \s+(?P.*?)\n + \s+(?P.*?)\n + \s+(?P.*?)\n + \s+(?P.*?)\n + + """, + re.X, +) +DIRECTORY_KEYS = ["odd_even", "filename", "size", "content_type", "content_encoding"] +HREF_NAME_PATTERN = re.compile(r'href="(.+?)">(.+?)<') + +# For JobsView +jobs_table_map = {} + +# For Timer Tasks +# STATE_STOPPED = 0, STATE_RUNNING = 1, STATE_PAUSED = 2 +SCHEDULER_STATE_DICT = { + STATE_STOPPED: "STATE_STOPPED", + STATE_RUNNING: "STATE_RUNNING", + STATE_PAUSED: "STATE_PAUSED", +} + +# For Setting Max Amount Of Hours A Scraper Can Run +# Set custom hours in the scrapydweb_settings file +MAX_HOURS = custom_max_running_hours or 3 + + +def setup_logfile(delete=False): + if delete: + for logfile in [RUN_SPIDER_HISTORY_LOG, TIMER_TASKS_HISTORY_LOG]: + if os.path.exists(logfile): + os.remove(logfile) + + if not os.path.exists(RUN_SPIDER_HISTORY_LOG): + with io.open(RUN_SPIDER_HISTORY_LOG, "w", encoding="utf-8") as f: + f.write(u"%s\n%s" % ("#" * 50, RUN_SPIDER_HISTORY_LOG)) + + if not os.path.exists(TIMER_TASKS_HISTORY_LOG): + with io.open(TIMER_TASKS_HISTORY_LOG, "w", encoding="utf-8") as f: + f.write(u"%s\n%s\n\n" % (TIMER_TASKS_HISTORY_LOG, "#" * 50)) + + +setup_logfile(delete=False) diff --git a/views/__init__.py b/views/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/api.py b/views/api.py new file mode 100644 index 00000000..abd4920a --- /dev/null +++ b/views/api.py @@ -0,0 +1,118 @@ +# coding: utf-8 +import re +import time + +from .baseview import BaseView + + +API_MAP = dict(start='schedule', stop='cancel', forcestop='cancel', liststats='logs/stats') + + +class ApiView(BaseView): + + def __init__(self): + super(ApiView, self).__init__() + + self.opt = self.view_args['opt'] + self.project = self.view_args['project'] + self.version_spider_job = self.view_args['version_spider_job'] + + self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, API_MAP.get(self.opt, self.opt)) + self.data = None + self.status_code = 0 + self.js = {} + + def dispatch_request(self, **kwargs): + self.update_url() + self.update_data() + self.get_result() + self.handle_result() + return self.json_dumps(self.js, sort_keys=False, as_response=True) + + def update_url(self): + if self.opt in ['listversions', 'listjobs']: + self.url += '?project=%s' % self.project + elif self.opt == 'listspiders': + if self.version_spider_job == self.DEFAULT_LATEST_VERSION: + self.url += '?project=%s' % self.project + else: + # Should be _version + self.url += '?project=%s&_version=%s' % (self.project, self.version_spider_job) + + def update_data(self): + self.data = dict(project=self.project) + if self.opt == 'start': + self.data['spider'] = self.version_spider_job + self.data['jobid'] = self.get_now_string() + elif 
self.opt in ['stop', 'forcestop']: + self.data['job'] = self.version_spider_job + elif self.opt == 'delversion': + self.data['version'] = self.version_spider_job + elif self.opt == 'delproject': + pass + else: + self.data = None + + def get_result(self): + timeout = 3 if self.opt == 'daemonstatus' else 60 + dumps_json = self.opt not in ['daemonstatus', 'liststats'] + times = 2 if self.opt == 'forcestop' else 1 + for __ in range(times): + self.status_code, self.js = self.make_request(self.url, data=self.data, auth=self.AUTH, + as_json=True, dumps_json=dumps_json, timeout=timeout) + if times != 1: + self.js['times'] = times + time.sleep(2) + + def handle_result(self): + if self.status_code != 200: + if self.opt == 'liststats': + if self.project and self.version_spider_job: # 'List Stats' in the Servers page + if self.status_code == 404: + self.js = dict(status=self.OK, tip="'pip install logparser' and run command 'logparser'") + else: # XMLHttpRequest in the Jobs page; POST in jobs.py + self.js['tip'] = ("'pip install logparser' on host '%s' and run command 'logparser' " + "to show crawled_pages and scraped_items. ") % self.SCRAPYD_SERVER + else: + self.js['tip'] = "Make sure that your Scrapyd server is accessible. " + elif self.js['status'] != self.OK: + if re.search('No such file|no active project', self.js.get('message', '')): + self.js['tip'] = "Maybe the project has been deleted, check out the Projects page. " + elif self.opt == 'listversions': + self.js['tip'] = ( + "Maybe it's caused by failing to compare versions; " + "you can check out the HELP section in the Deploy Project page for more info, " + "and solve the problem in the Projects page. " + ) + elif self.opt == 'listspiders' and re.search("TypeError: 'tuple'", self.js.get('message', '')): + self.js['tip'] = "Maybe it's a broken project, check out the Projects page to delete it. 
" + elif self.opt == 'liststats': + if self.js.get('logparser_version') != self.LOGPARSER_VERSION: + if self.project and self.version_spider_job: # 'List Stats' in the Servers page + tip = "'pip install --upgrade logparser' to update LogParser to v%s" % self.LOGPARSER_VERSION + self.js = dict(status=self.OK, tip=tip) + else: # XMLHttpRequest in the Jobs page; POST in jobs.py + self.js['tip'] = ("'pip install --upgrade logparser' on host '%s' and run command 'logparser' " + "to update LogParser to v%s") % (self.SCRAPYD_SERVER, self.LOGPARSER_VERSION) + self.js['status'] = self.ERROR + elif self.project and self.version_spider_job: # 'List Stats' in the Servers page + self.extract_pages_items() + + def extract_pages_items(self): + details = None + if self.project in self.js['datas']: + for spider in self.js['datas'][self.project]: + for jobid in self.js['datas'][self.project][spider]: + if jobid == self.version_spider_job: + details = self.js['datas'][self.project][spider][self.version_spider_job] + self.js['project'] = self.project + self.js['spider'] = spider + self.js['jobid'] = jobid + break + if not details: + details = dict(pages=self.NA, items=self.NA) + details.setdefault('project', self.project) + details.setdefault('spider', self.NA) + details.setdefault('jobid', self.version_spider_job) + details['logparser_version'] = self.js.get('logparser_version', None) + self.js = dict(status=self.OK, details=details) diff --git a/views/baseview.py b/views/baseview.py new file mode 100644 index 00000000..d17c91fa --- /dev/null +++ b/views/baseview.py @@ -0,0 +1,571 @@ +# coding: utf-8 +import logging +import os +import re + +from flask import current_app as app +from flask import Response, flash, g, request, url_for +from flask.views import View +from logparser import __version__ as LOGPARSER_VERSION +from six import text_type + +from ..__version__ import __version__ as SCRAPYDWEB_VERSION +from ..common import ( + get_now_string, + get_response_from_view, + handle_metadata, + handle_slash, + json_dumps, + session, +) +from ..vars import ( + ALLOWED_SCRAPYD_LOG_EXTENSIONS, + APSCHEDULER_DATABASE_URI, + DATA_PATH, + DEMO_PROJECTS_PATH, + DEPLOY_PATH, + PARSE_PATH, + ALERT_TRIGGER_KEYS, + LEGAL_NAME_PATTERN, + SCHEDULE_ADDITIONAL, + SCHEDULE_PATH, + STATE_PAUSED, + STATE_RUNNING, + STATS_PATH, + STRICT_NAME_PATTERN, +) +from ..utils.scheduler import scheduler + + +class BaseView(View): + SCRAPYDWEB_VERSION = SCRAPYDWEB_VERSION + LOGPARSER_VERSION = LOGPARSER_VERSION + + DEMO_PROJECTS_PATH = DEMO_PROJECTS_PATH + DEPLOY_PATH = DEPLOY_PATH + PARSE_PATH = PARSE_PATH + SCHEDULE_PATH = SCHEDULE_PATH + STATS_PATH = STATS_PATH + + OK = "ok" + ERROR = "error" + NA = "N/A" + INFO = "info" + WARN = "warning" + DEFAULT_LATEST_VERSION = "default: the latest version" + LEGAL_NAME_PATTERN = LEGAL_NAME_PATTERN + STRICT_NAME_PATTERN = STRICT_NAME_PATTERN + ALERT_TRIGGER_KEYS = ALERT_TRIGGER_KEYS + + methods = ["GET", "POST"] + + def __init__(self, *args, **kwargs): + self.logger = logging.getLogger(self.__class__.__name__) + # Not in the config file + self.DEFAULT_SETTINGS_PY_PATH = app.config["DEFAULT_SETTINGS_PY_PATH"] + self.SCRAPYDWEB_SETTINGS_PY_PATH = app.config["SCRAPYDWEB_SETTINGS_PY_PATH"] + self.MAIN_PID = app.config["MAIN_PID"] + self.LOGPARSER_PID = app.config["LOGPARSER_PID"] + self.POLL_PID = app.config["POLL_PID"] + + # System + self.DEBUG = app.config.get("DEBUG", False) + self.VERBOSE = app.config.get("VERBOSE", False) + self.DATA_PATH = DATA_PATH + self.APSCHEDULER_DATABASE_URI = 
APSCHEDULER_DATABASE_URI + self.SQLALCHEMY_DATABASE_URI = app.config["SQLALCHEMY_DATABASE_URI"] + self.SQLALCHEMY_BINDS = app.config["SQLALCHEMY_BINDS"] + + _level = logging.DEBUG if self.VERBOSE else logging.INFO + self.logger.setLevel(_level) + logging.getLogger("requests").setLevel(_level) + logging.getLogger("urllib3").setLevel(_level) + + # if app.testing: + self.logger.debug( + "view_args of %s\n%s", request.url, self.json_dumps(request.view_args) + ) + if request.args: + self.logger.debug( + "request.args of %s\n%s", request.url, self.json_dumps(request.args) + ) + if request.form: + self.logger.debug( + "request.form from %s\n%s", request.url, self.json_dumps(request.form) + ) + if request.json: + self.logger.debug( + "request.json from %s\n%s", request.url, self.json_dumps(request.json) + ) + if request.files: + self.logger.debug( + "request.files from %s\n\n %s\n", request.url, request.files + ) + + # ScrapydWeb + self.SCRAPYDWEB_BIND = app.config.get("SCRAPYDWEB_BIND", "0.0.0.0") + self.SCRAPYDWEB_PORT = app.config.get("SCRAPYDWEB_PORT", 5000) + + self.ENABLE_AUTH = app.config.get("ENABLE_AUTH", False) + self.USERNAME = app.config.get("USERNAME", "") + self.PASSWORD = app.config.get("PASSWORD", "") + + self.ENABLE_HTTPS = app.config.get("ENABLE_HTTPS", False) + self.CERTIFICATE_FILEPATH = app.config.get("CERTIFICATE_FILEPATH", "") + self.PRIVATEKEY_FILEPATH = app.config.get("PRIVATEKEY_FILEPATH", "") + + self.URL_SCRAPYDWEB = app.config.get("URL_SCRAPYDWEB", "http://127.0.0.1:5000") + + # Scrapy + self.SCRAPY_PROJECTS_DIR = ( + app.config.get("SCRAPY_PROJECTS_DIR", "") or self.DEMO_PROJECTS_PATH + ) + + # Scrapyd + self.SCRAPYD_SERVERS = app.config.get("SCRAPYD_SERVERS", []) or [ + "127.0.0.1:6800" + ] + self.SCRAPYD_SERVERS_AMOUNT = len(self.SCRAPYD_SERVERS) + self.SCRAPYD_SERVERS_GROUPS = app.config.get("SCRAPYD_SERVERS_GROUPS", []) or [ + "" + ] + self.SCRAPYD_SERVERS_AUTHS = app.config.get("SCRAPYD_SERVERS_AUTHS", []) or [ + None + ] + self.SCRAPYD_SERVERS_PUBLIC_URLS = ( + app.config.get("SCRAPYD_SERVERS_PUBLIC_URLS", None) + or [""] * self.SCRAPYD_SERVERS_AMOUNT + ) + + self.LOCAL_SCRAPYD_SERVER = app.config.get("LOCAL_SCRAPYD_SERVER", "") + self.LOCAL_SCRAPYD_LOGS_DIR = app.config.get("LOCAL_SCRAPYD_LOGS_DIR", "") + self.SCRAPYD_LOG_EXTENSIONS = ( + app.config.get("SCRAPYD_LOG_EXTENSIONS", []) + or ALLOWED_SCRAPYD_LOG_EXTENSIONS + ) + + # LogParser + self.ENABLE_LOGPARSER = app.config.get("ENABLE_LOGPARSER", False) + self.BACKUP_STATS_JSON_FILE = app.config.get("BACKUP_STATS_JSON_FILE", True) + + # Timer Tasks + self.scheduler = scheduler + self.JOBS_SNAPSHOT_INTERVAL = app.config.get("JOBS_SNAPSHOT_INTERVAL", 300) + + # Run Spider + self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = app.config.get( + "SCHEDULE_EXPAND_SETTINGS_ARGUMENTS", False + ) + self.SCHEDULE_CUSTOM_USER_AGENT = app.config.get( + "SCHEDULE_CUSTOM_USER_AGENT", "Mozilla/5.0" + ) + self.SCHEDULE_USER_AGENT = app.config.get("SCHEDULE_USER_AGENT", None) + self.SCHEDULE_ROBOTSTXT_OBEY = app.config.get("SCHEDULE_ROBOTSTXT_OBEY", None) + self.SCHEDULE_COOKIES_ENABLED = app.config.get("SCHEDULE_COOKIES_ENABLED", None) + self.SCHEDULE_CONCURRENT_REQUESTS = app.config.get( + "SCHEDULE_CONCURRENT_REQUESTS", None + ) + self.SCHEDULE_DOWNLOAD_DELAY = app.config.get("SCHEDULE_DOWNLOAD_DELAY", None) + self.SCHEDULE_ADDITIONAL = app.config.get( + "SCHEDULE_ADDITIONAL", SCHEDULE_ADDITIONAL + ) + + # Page Display + self.SHOW_SCRAPYD_ITEMS = app.config.get("SHOW_SCRAPYD_ITEMS", True) + self.SHOW_JOBS_JOB_COLUMN = 
app.config.get("SHOW_JOBS_JOB_COLUMN", False) + self.JOBS_FINISHED_JOBS_LIMIT = app.config.get("JOBS_FINISHED_JOBS_LIMIT", 0) + self.JOBS_RELOAD_INTERVAL = app.config.get("JOBS_RELOAD_INTERVAL", 300) + self.LONG_RUNNING_SCRAPER_STOP_INTERVAL = app.config.get( + "LONG_RUNNING_SCRAPER_STOP_INTERVAL", 300 + ) + self.DAEMONSTATUS_REFRESH_INTERVAL = app.config.get( + "DAEMONSTATUS_REFRESH_INTERVAL", 10 + ) + + # Send text + self.SLACK_TOKEN = app.config.get("SLACK_TOKEN", "") + self.SLACK_CHANNEL = app.config.get("SLACK_CHANNEL", "") or "general" + self.TELEGRAM_TOKEN = app.config.get("TELEGRAM_TOKEN", "") + self.TELEGRAM_CHAT_ID = app.config.get("TELEGRAM_CHAT_ID", 0) + self.EMAIL_SUBJECT = ( + app.config.get("EMAIL_SUBJECT", "") or "Email from #scrapydweb" + ) + + # Monitor & Alert + self.ENABLE_MONITOR = app.config.get("ENABLE_MONITOR", False) + self.ENABLE_SLACK_ALERT = app.config.get("ENABLE_SLACK_ALERT", False) + self.ENABLE_TELEGRAM_ALERT = app.config.get("ENABLE_TELEGRAM_ALERT", False) + self.ENABLE_EMAIL_ALERT = app.config.get("ENABLE_EMAIL_ALERT", False) + + self.EMAIL_SENDER = app.config.get("EMAIL_SENDER", "") + self.EMAIL_RECIPIENTS = app.config.get("EMAIL_RECIPIENTS", []) + self.EMAIL_USERNAME = app.config.get("EMAIL_USERNAME", "") or self.EMAIL_SENDER + self.EMAIL_PASSWORD = app.config.get("EMAIL_PASSWORD", "") + + self.SMTP_SERVER = app.config.get("SMTP_SERVER", "") + self.SMTP_PORT = app.config.get("SMTP_PORT", 0) + self.SMTP_OVER_SSL = app.config.get("SMTP_OVER_SSL", False) + self.SMTP_CONNECTION_TIMEOUT = app.config.get("SMTP_CONNECTION_TIMEOUT", 30) + + self.EMAIL_KWARGS = dict( + email_username=self.EMAIL_USERNAME, + email_password=self.EMAIL_PASSWORD, + email_sender=self.EMAIL_SENDER, + email_recipients=self.EMAIL_RECIPIENTS, + smtp_server=self.SMTP_SERVER, + smtp_port=self.SMTP_PORT, + smtp_over_ssl=self.SMTP_OVER_SSL, + smtp_connection_timeout=self.SMTP_CONNECTION_TIMEOUT, + subject="subject", + content="content", + ) + + self.POLL_ROUND_INTERVAL = app.config.get("POLL_ROUND_INTERVAL", 300) + self.POLL_REQUEST_INTERVAL = app.config.get("POLL_REQUEST_INTERVAL", 10) + self.ALERT_WORKING_DAYS = app.config.get("ALERT_WORKING_DAYS", []) + self.ALERT_WORKING_HOURS = app.config.get("ALERT_WORKING_HOURS", []) + self.ON_JOB_RUNNING_INTERVAL = app.config.get("ON_JOB_RUNNING_INTERVAL", 0) + self.ON_JOB_FINISHED = app.config.get("ON_JOB_FINISHED", False) + # ['CRITICAL', 'ERROR', 'WARNING', 'REDIRECT', 'RETRY', 'IGNORE'] + for key in self.ALERT_TRIGGER_KEYS: + setattr( + self, + "LOG_%s_THRESHOLD" % key, + app.config.get("LOG_%s_THRESHOLD" % key, 0), + ) + setattr( + self, + "LOG_%s_TRIGGER_STOP" % key, + app.config.get("LOG_%s_TRIGGER_STOP" % key, False), + ) + setattr( + self, + "LOG_%s_TRIGGER_FORCESTOP" % key, + app.config.get("LOG_%s_TRIGGER_FORCESTOP" % key, False), + ) + + # Other attributes not from config + self.view_args = request.view_args + self.node = self.view_args["node"] + assert 0 < self.node <= self.SCRAPYD_SERVERS_AMOUNT, ( + "node index error: %s, which should be between 1 and %s" + % (self.node, self.SCRAPYD_SERVERS_AMOUNT) + ) + self.SCRAPYD_SERVER = self.SCRAPYD_SERVERS[self.node - 1] + self.IS_LOCAL_SCRAPYD_SERVER = self.SCRAPYD_SERVER == self.LOCAL_SCRAPYD_SERVER + self.GROUP = self.SCRAPYD_SERVERS_GROUPS[self.node - 1] + self.AUTH = self.SCRAPYD_SERVERS_AUTHS[self.node - 1] + self.SCRAPYD_SERVER_PUBLIC_URL = self.SCRAPYD_SERVERS_PUBLIC_URLS[self.node - 1] + + ua = request.headers.get("User-Agent", "") + m_mobile = re.search( + 
r"Android|webOS|iPad|iPhone|iPod|BlackBerry|IEMobile|Opera Mini", ua, re.I + ) + self.IS_MOBILE = True if m_mobile else False + + m_ipad = re.search(r"iPad", ua, re.I) + self.IS_IPAD = True if m_ipad else False + + # http://werkzeug.pocoo.org/docs/0.14/utils/#module-werkzeug.useragents + # /site-packages/werkzeug/useragents.py + browser = request.user_agent.browser or "" # lib requests GET: None + m_edge = re.search(r"Edge", ua, re.I) + self.IS_IE_EDGE = True if (browser == "msie" or m_edge) else False + + self.USE_MOBILEUI = request.args.get("ui", "") == "mobile" + self.UI = "mobile" if self.USE_MOBILEUI else None + self.GET = request.method == "GET" + self.POST = request.method == "POST" + + self.FEATURES = "" + self.FEATURES += "A" if self.ENABLE_AUTH else "-" + self.FEATURES += ( + "D" if handle_metadata().get("jobs_style") == "database" else "C" + ) + self.FEATURES += ( + "d" if self.SCRAPY_PROJECTS_DIR != self.DEMO_PROJECTS_PATH else "-" + ) + self.FEATURES += "L" if self.ENABLE_LOGPARSER else "-" + self.FEATURES += "Sl" if self.ENABLE_SLACK_ALERT else "-" + self.FEATURES += "Tg" if self.ENABLE_TELEGRAM_ALERT else "-" + self.FEATURES += "Em" if self.ENABLE_EMAIL_ALERT else "-" + self.FEATURES += "P" if self.IS_MOBILE else "-" + self.FEATURES += "M" if self.USE_MOBILEUI else "-" + self.FEATURES += "S" if self.ENABLE_HTTPS else "-" + self.any_running_apscheduler_jobs = any( + job.next_run_time for job in self.scheduler.get_jobs(jobstore="default") + ) + if self.scheduler.state == STATE_PAUSED: + self.FEATURES += "-" + elif self.any_running_apscheduler_jobs: + self.FEATURES += "T" + else: + self.FEATURES += "t" + if not self.SQLALCHEMY_DATABASE_URI.startswith("sqlite"): + self.FEATURES += self.SQLALCHEMY_DATABASE_URI[:3] + + self.template_fail = ( + "scrapydweb/fail_mobileui.html" + if self.USE_MOBILEUI + else "scrapydweb/fail.html" + ) + self.update_g() + + @staticmethod + def get_job_without_ext(job): + if job.endswith(".tar.gz"): + return job[: -len(".tar.gz")] + else: + return os.path.splitext(job)[0] # '1.1.log' => ('1.1', '.log') + + @staticmethod + def get_now_string(allow_space=False): + return get_now_string(allow_space=allow_space) + + def get_response_from_view(self, url, data=None, as_json=False): + auth = (self.USERNAME, self.PASSWORD) if self.ENABLE_AUTH else None + return get_response_from_view(url, auth=auth, data=data, as_json=as_json) + + def get_selected_nodes(self): + selected_nodes = [] + for n in range(1, self.SCRAPYD_SERVERS_AMOUNT + 1): + if request.form.get(str(n)) == "on": + selected_nodes.append(n) + return selected_nodes + + @staticmethod + def handle_slash(string): + return handle_slash(string) + + @staticmethod + def json_dumps( + obj, sort_keys=True, indent=4, ensure_ascii=False, as_response=False + ): + # flask.jsonify + # https://flask.palletsprojects.com/en/1.1.x/config/#JSONIFY_MIMETYPE + # https://stackoverflow.com/questions/11773348/python-flask-how-to-set-content-type + # https://stackoverflow.com/questions/9254891/what-does-content-type-application-json-charset-utf-8-really-mean + js = json_dumps( + obj, sort_keys=sort_keys, indent=indent, ensure_ascii=ensure_ascii + ) + if as_response: + # Content-Type: application/json + return Response(js, mimetype="application/json") + else: + return js + + @staticmethod + def remove_microsecond(dt): + return str(dt)[:19] + + def make_request( + self, + url, + data=None, + auth=None, + as_json=True, + dumps_json=True, + check_status=True, + timeout=60, + ): + """ + :param url: url to make request + :param data: 
None or a dict object to post + :param auth: None or (username, password) for basic auth + :param as_json: return a dict object if set True, else text + :param dumps_json: whether to dumps the json response when as_json is set to True + :param check_status: whether to log error when status != 'ok' + :param timeout: timeout when making request, in seconds + """ + try: + if "addversion.json" in url and data: + self.logger.debug(">>>>> POST %s", url) + self.logger.debug( + self.json_dumps( + dict( + project=data["project"], + version=data["version"], + egg="%s bytes binary egg file" % len(data["egg"]), + ) + ) + ) + else: + self.logger.debug(">>>>> %s %s", "POST" if data else "GET", url) + if data: + self.logger.debug("POST data: %s", self.json_dumps(data)) + + if data: + r = session.post(url, data=data, auth=auth, timeout=timeout) + else: + r = session.get(url, auth=auth, timeout=timeout) + r.encoding = "utf-8" + except Exception as err: + # self.logger.error('!!!!! %s %s' % (err.__class__.__name__, err)) + self.logger.error("!!!!! error with %s: %s", url, err) + if as_json: + r_json = dict( + url=url, + auth=auth, + status_code=-1, + status=self.ERROR, + message=str(err), + when=self.get_now_string(True), + ) + return -1, r_json + else: + return -1, str(err) + else: + if as_json: + r_json = {} + try: + # listprojects would get 502 html when Scrapyd server reboots + r_json = ( + r.json() + ) # PY3: json.decoder.JSONDecodeError PY2: exceptions.ValueError + except ValueError as err: # issubclass(JSONDecodeError, ValueError) + self.logger.error("Fail to decode json from %s: %s", url, err) + r_json = dict(status=self.ERROR, message=r.text) + finally: + # Scrapyd in Python2: Traceback (most recent call last):\\n + # Scrapyd in Python3: Traceback (most recent call last):\r\n + message = r_json.get("message", "") + if message and not isinstance(message, dict): + r_json["message"] = re.sub(r"\\n", "\n", message) + r_json.update( + dict( + url=url, + auth=auth, + status_code=r.status_code, + when=self.get_now_string(True), + ) + ) + status = r_json.setdefault("status", self.NA) + if r.status_code != 200 or (check_status and status != self.OK): + self.logger.error( + "!!!!! (%s) %s: %s", r.status_code, status, url + ) + else: + self.logger.debug( + "<<<<< (%s) %s: %s", r.status_code, status, url + ) + if dumps_json: + self.logger.debug( + "Got json from %s: %s", url, self.json_dumps(r_json) + ) + else: + self.logger.debug( + "Got keys from (%s) %s %s: %s", + r_json.get("status_code"), + r_json.get("status"), + url, + r_json.keys(), + ) + + return r.status_code, r_json + else: + if r.status_code == 200: + _text = ( + r.text[:100] + "......" + r.text[-100:] + if len(r.text) > 200 + else r.text + ) + self.logger.debug( + "<<<<< (%s) %s\n%s", r.status_code, url, repr(_text) + ) + else: + self.logger.error("!!!!! 
(%s) %s\n%s", r.status_code, url, r.text) + + return r.status_code, r.text + + def update_g(self): + # g lifetime: every single request + # Note that use inject_variable() in View class would cause memory leak, issue #14 + g.IS_MOBILE = self.IS_MOBILE + g.url_jobs_list = [ + url_for("jobs", node=node, ui=self.UI) + for node in range(1, self.SCRAPYD_SERVERS_AMOUNT + 1) + ] + g.multinode = ( + '" + ) + # For base.html + if not self.USE_MOBILEUI: + g.url_daemonstatus = url_for("api", node=self.node, opt="daemonstatus") + g.url_menu_servers = url_for("servers", node=self.node) + g.url_menu_jobs = url_for("jobs", node=self.node) + g.url_menu_nodereports = url_for("nodereports", node=self.node) + g.url_menu_clusterreports = url_for("clusterreports", node=self.node) + g.url_menu_tasks = url_for("tasks", node=self.node) + g.url_menu_deploy = url_for("deploy", node=self.node) + g.url_menu_schedule = url_for("schedule", node=self.node) + g.url_menu_projects = url_for("projects", node=self.node) + g.url_menu_logs = url_for("logs", node=self.node) + g.url_menu_items = url_for("items", node=self.node) + g.url_menu_sendtext = url_for("sendtext", node=self.node) + g.url_menu_parse = url_for("parse.upload", node=self.node) + g.url_menu_settings = url_for("settings", node=self.node) + g.url_menu_mobileui = url_for("index", node=self.node, ui="mobile") + g.scheduler_state_paused = ( + self.scheduler.state == STATE_PAUSED + and self.any_running_apscheduler_jobs + ) + g.scheduler_state_running = ( + self.scheduler.state == STATE_RUNNING + and self.any_running_apscheduler_jobs + ) + + # Issue#48 [PY2] UnicodeDecodeError raised when there are some files with illegal filenames in `SCRAPY_PROJECTS_DIR` + # https://stackoverflow.com/questions/21772271/unicodedecodeerror-when-performing-os-walk + # https://xuanwo.io/2018/04/01/python-os-walk/ + # Tested in Ubuntu: + # touch $(echo -e "\x8b\x8bFile") + # mkdir $(echo -e "\x8b\x8bFolder") + def safe_walk(self, top, topdown=True, onerror=None, followlinks=False): + islink, join, isdir = os.path.islink, os.path.join, os.path.isdir + + # touch $(echo -e "\x8b\x8bThis is a bad filename") + # ('top: ', u'/home/username/download/scrapydweb/scrapydweb/data/demo_projects/ScrapydWeb_demo') + # ('names: ', ['\x8b\x8bThis', u'ScrapydWeb_demo', u'filename', u'scrapy.cfg', u'a', u'is', u'bad']) + try: + names = os.listdir(top) + except OSError as err: + if onerror is not None: + onerror(err) + return + + new_names = [] + for name in names: + if isinstance(name, text_type): + new_names.append(name) + else: + msg = "Ignore non-unicode filename %s in %s" % (repr(name), top) + self.logger.error(msg) + flash(msg, self.WARN) + names = new_names + + dirs, nondirs = [], [] + for name in names: + if isdir(join(top, name)): + dirs.append(name) + else: + nondirs.append(name) + + if topdown: + yield top, dirs, nondirs + for name in dirs: + new_path = join(top, name) + if followlinks or not islink(new_path): + for x in self.safe_walk(new_path, topdown, onerror, followlinks): + yield x + if not topdown: + yield top, dirs, nondirs + + +class MetadataView(BaseView): + def __init__(self): + super(MetadataView, self).__init__() + + def dispatch_request(self, **kwargs): + return self.json_dumps(handle_metadata(), as_response=True) diff --git a/views/dashboard/__init__.py b/views/dashboard/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/dashboard/cluster_reports.py b/views/dashboard/cluster_reports.py new file mode 100644 index 00000000..5cb79cfb --- /dev/null +++ 
b/views/dashboard/cluster_reports.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from flask import redirect, render_template, url_for + +from ..baseview import BaseView + + +metadata = dict( + project='', + spider='', + job='', + selected_nodes=[] +) + + +class ClusterReportsView(BaseView): + + def __init__(self): + super(ClusterReportsView, self).__init__() + + self.project = self.view_args['project'] or metadata['project'] + self.spider = self.view_args['spider'] or metadata['spider'] + self.job = self.view_args['job'] or metadata['job'] + self.selected_nodes = self.get_selected_nodes() or metadata['selected_nodes'] + metadata['project'] = self.project + metadata['spider'] = self.spider + metadata['job'] = self.job + metadata['selected_nodes'] = self.selected_nodes + + self.template = 'scrapydweb/cluster_reports.html' + + def dispatch_request(self, **kwargs): + if all([self.project, self.spider, self.job]): + # Click the reports menu for the second time + if not any([self.view_args['project'], self.view_args['spider'], self.view_args['job']]): + return redirect(url_for('clusterreports', node=self.node, project=self.project, + spider=self.spider, job=self.job)) + # Click reports button on the Jobs page after reboot + if not self.selected_nodes: + return redirect(url_for('servers', node=self.node, opt='getreports', project=self.project, + spider=self.spider, version_job=self.job)) + + # Click the reports menu for the first time + if not any([self.project, self.spider, self.job]): + url_servers = '' + else: + url_servers = url_for('servers', node=self.node, opt='getreports', project=self.project, + spider=self.spider, version_job=self.job) + + kwargs = dict( + node=self.node, + project=self.project, + spider=self.spider, + job=self.job, + selected_nodes=self.selected_nodes, + url_report=url_for('log', node=self.node, opt='report', project=self.project, + spider=self.spider, job=self.job), + url_servers=url_servers, + url_jobs=url_for('jobs', node=self.node), + # url_nodereports=url_for('nodereports', node=self.node), + ) + return render_template(self.template, **kwargs) diff --git a/views/dashboard/jobs.py b/views/dashboard/jobs.py new file mode 100644 index 00000000..f62eba9a --- /dev/null +++ b/views/dashboard/jobs.py @@ -0,0 +1,671 @@ +# coding: utf-8 +from collections import OrderedDict +from datetime import datetime +import re +import traceback + +from flask import flash, get_flashed_messages, render_template, request, url_for +from six.moves.urllib.parse import urljoin + +from ...common import handle_metadata +from ...models import create_jobs_table, db +from ...vars import STRICT_NAME_PATTERN, jobs_table_map +from ..baseview import BaseView + + +_metadata = handle_metadata() +metadata = dict( + pageview=_metadata.get("pageview", 1), + per_page=_metadata.get("jobs_per_page", 100), + style=_metadata.get("jobs_style", "database"), + unique_key_strings={}, +) + +STATUS_PENDING = "0" +STATUS_RUNNING = "1" +STATUS_FINISHED = "2" +NOT_DELETED = "0" +DELETED = "1" +HREF_PATTERN = re.compile( + r"""href=['"](.+?)['"]""" +) # Temp support for Scrapyd v1.3.0 (not released) +JOB_PATTERN = re.compile( + r""" + <tr> + <td>(?P<project>.*?)</td> + <td>(?P<spider>.*?)</td> + <td>(?P<job>.*?)</td> + (?:<td>(?P<pid>.*?)</td>)? + (?:<td>(?P<start>.*?)</td>)? + (?:<td>(?P<runtime>.*?)</td>)? + (?:<td>(?P<finish>.*?)</td>)? + (?:<td>(?P<href_log>.*?)</td>)? + (?:<td>(?P<href_items>.*?)</td>)? + [\w\W]*? 
# Temp support for Scrapyd v1.3.0 (not released) + + """, + re.X, +) +JOB_KEYS = [ + "project", + "spider", + "job", + "pid", + "start", + "runtime", + "finish", + "href_log", + "href_items", +] + + +class JobsView(BaseView): + # methods = ['GET'] + metadata = metadata + + def __init__(self): + super(JobsView, self).__init__() + + style = request.args.get("style") + self.style = ( + style if style in ["database", "classic"] else self.metadata["style"] + ) + if self.style != self.metadata["style"]: + self.metadata["style"] = self.style + handle_metadata("jobs_style", self.style) + self.logger.debug("Change style to %s", self.metadata["style"]) + + self.per_page = request.args.get( + "per_page", default=self.metadata["per_page"], type=int + ) + if self.per_page != self.metadata["per_page"]: + self.metadata["per_page"] = self.per_page + handle_metadata("jobs_per_page", self.per_page) + self.logger.debug("Change per_page to %s", self.metadata["per_page"]) + self.page = request.args.get("page", default=1, type=int) + + self.url = "http://%s/jobs" % self.SCRAPYD_SERVER + if self.SCRAPYD_SERVER_PUBLIC_URL: + self.public_url = "%s/jobs" % self.SCRAPYD_SERVER_PUBLIC_URL + else: + self.public_url = "" + self.text = "" + self.kwargs = {} + if self.USE_MOBILEUI: + self.style = "classic" + self.template = "scrapydweb/jobs_mobileui.html" + elif self.style == "classic": + self.template = "scrapydweb/jobs_classic.html" + else: # 'database' + self.template = "scrapydweb/jobs.html" + + self.listjobs = request.args.get("listjobs", None) + + self.liststats_datas = {} + self.jobs_dict = {} + + self.jobs = [] + self.jobs_backup = [] + self.pending_jobs = [] + self.running_jobs = [] + self.finished_jobs = [] + self.jobs_pagination = None + + self.Job = None # database class Job + + def dispatch_request(self, **kwargs): + status_code, self.text = self.make_request( + self.url, auth=self.AUTH, as_json=False + ) + if status_code != 200 or not re.search(r"

<h1>Jobs</h1>", self.text): + kwargs = dict( + node=self.node, + url=self.url, + status_code=status_code, + text=self.text, + tip="Click the above link to make sure your Scrapyd server is accessible. ", + ) + return render_template(self.template_fail, **kwargs) + # Temp support for Scrapyd v1.3.0 (not released) + self.text = re.sub(r"<thead>.*?</thead>", "", self.text, flags=re.S) + self.jobs = [ + dict(zip(JOB_KEYS, job)) for job in re.findall(JOB_PATTERN, self.text) + ] + self.jobs_backup = list(self.jobs) + + if self.listjobs: + return self.json_dumps(self.jobs, as_response=True) + + if self.POST: # To update self.liststats_datas + self.get_liststats_datas() + else: + self.metadata["pageview"] += 1 + self.logger.debug("metadata: %s", self.metadata) + self.set_flash() + if self.style == "database" or self.POST: + self.handle_jobs_with_db() + if self.POST: + try: + self.set_jobs_dict() + except: + raise + finally: + get_flashed_messages() + return self.json_dumps(self.jobs_dict, as_response=True) + if self.style != "database": + self.jobs = self.jobs_backup + self.handle_jobs_without_db() + self.set_kwargs() + return render_template(self.template, **self.kwargs) + + def set_flash(self): + if self.metadata["pageview"] > 2 and self.metadata["pageview"] % 100: + return + if not self.ENABLE_AUTH and self.SCRAPYD_SERVERS_AMOUNT == 1: + flash("Set 'ENABLE_AUTH = True' to enable basic auth for web UI", self.INFO) + if self.IS_LOCAL_SCRAPYD_SERVER: + if not self.LOCAL_SCRAPYD_LOGS_DIR: + flash( + ( + "Set up the LOCAL_SCRAPYD_LOGS_DIR option to speed up the loading of scrapy logfiles " + "for the LOCAL_SCRAPYD_SERVER %s" % self.SCRAPYD_SERVER + ), + self.WARN, + ) + if not self.ENABLE_LOGPARSER: + flash( + "Set 'ENABLE_LOGPARSER = True' to run LogParser as a subprocess at startup", + self.WARN, + ) + if not self.ENABLE_MONITOR and self.SCRAPYD_SERVERS_AMOUNT == 1: + flash( + "Set 'ENABLE_MONITOR = True' to enable the monitor feature", self.INFO + ) + + # stats.json by LogParser + # { + # "status_code": 200, + # "status": "ok", + # "datas": { + # "demo": { + # "test": { + # "2019-01-01T0_00_01": { + # "pages": 3, + # "items": 2, + def get_liststats_datas(self): + # NOTE: get_response_from_view() would update g.url_jobs_list, unexpected for mobileui + # request.url: http://localhost/1/api/liststats/ + # TODO: test https + url_liststats = url_for("api", node=self.node, opt="liststats") + js = self.get_response_from_view(url_liststats, as_json=True) + if js["status"] == self.OK: + self.liststats_datas = js.pop("datas", {}) + self.logger.debug( + "Got datas with %s entries from liststats: %s", + len(self.liststats_datas), + js, + ) + else: + self.logger.warning( + "Fail to get datas from liststats: (%s) %s %s", + js["status_code"], + js["status"], + js.get("tip", ""), + ) + + def create_table(self): + self.Job = jobs_table_map.get(self.node, None) + if self.Job is not None: + self.logger.debug("Got table: %s", self.Job.__tablename__) + else: + self.Job = create_jobs_table( + re.sub(STRICT_NAME_PATTERN, "_", self.SCRAPYD_SERVER) + ) + # sqlite3.OperationalError: table "127_0_0_1_6800" already exists + db.create_all(bind="jobs") + self.metadata[self.node] = self.Job + jobs_table_map[self.node] = self.Job + self.logger.debug("Created table: %s", self.Job.__tablename__) + + def handle_jobs_with_db(self): + try: + if request.args.get("raise_exception") == "True": # For test only + assert False, "raise_exception: True" + self.handle_unique_constraint() + self.create_table() + self.db_insert_jobs() + self.db_clean_pending_jobs() + 
self.query_jobs() + except Exception as err: + self.logger.error( + "Fail to persist jobs in database: %s", traceback.format_exc() + ) + db.session.rollback() + flash("Fail to persist jobs in database: %s" % err, self.WARN) + # sqlalchemy.exc.InvalidRequestError: Table '127_0_0_1_6800' is already defined for this MetaData instance. + # Specify 'extend_existing=True' to redefine options and columns on an existing Table object. + if "is already defined for this MetaData instance" in str(err): + flash( + "Please restart ScrapydWeb to work around this occasional bug!", + self.WARN, + ) + if self.style == "database" and not self.POST: + self.style = "classic" + self.template = "scrapydweb/jobs_classic.html" + self.metadata["style"] = self.style + handle_metadata("jobs_style", self.style) + msg = "Change style to %s" % self.style + self.logger.info(msg) + # flash(msg, self.WARN) + + # Note that there may be jobs with the same combination of (project, spider, job) in the fetched Jobs + def handle_unique_constraint(self): + seen_jobs = OrderedDict() + for job in self.jobs: # (Pending, Running) ASC + if job["finish"]: + break + unique_key = (job["project"], job["spider"], job["job"]) + if unique_key in seen_jobs: # ignore previous + start = seen_jobs[unique_key]["start"] + finish = seen_jobs[unique_key]["finish"] + unique_key_string = "/".join( + list(unique_key) + [start, finish, str(self.node)] + ) + if start: + msg = "Ignore seen running job: %s, started at %s" % ( + "/".join(unique_key), + start, + ) + else: + msg = "Ignore seen pending job: %s" % ("/".join(unique_key)) + self.logger.debug(msg) + if ( + unique_key_string not in self.metadata["unique_key_strings"] + ): # flash only once + self.metadata["unique_key_strings"][unique_key_string] = None + flash(msg, self.WARN if start else self.INFO) + seen_jobs.pop(unique_key) + seen_jobs[unique_key] = job + for job in reversed(self.jobs): # Finished DESC + if not job["finish"]: + break + unique_key = (job["project"], job["spider"], job["job"]) + if unique_key in seen_jobs: # ignore current + unique_key_string = "/".join( + list(unique_key) + [job["start"], job["finish"], str(self.node)] + ) + msg = "Ignore seen finished job: %s, started at %s" % ( + "/".join(unique_key), + job["start"], + ) + self.logger.debug(msg) + if unique_key_string not in self.metadata["unique_key_strings"]: + self.metadata["unique_key_strings"][unique_key_string] = None + flash(msg, self.INFO) + else: + seen_jobs[unique_key] = job + self.jobs = list(seen_jobs.values()) + + def db_insert_jobs(self): + records = [] + for job in self.jobs: # set(self.jobs): unhashable type: 'dict' + record = self.Job.query.filter_by( + project=job["project"], spider=job["spider"], job=job["job"] + ).first() + if record: + self.logger.debug("Found job in database: %s", record) + if record.deleted == DELETED: + if ( + record.status == STATUS_FINISHED + and str(record.start) == job["start"] + ): + self.logger.info("Ignore deleted job: %s", record) + continue + else: + record.deleted = NOT_DELETED + record.pages = None + record.items = None + self.logger.info("Recover deleted job: %s", record) + flash("Recover deleted job: %s" % job, self.WARN) + else: + record = self.Job() + records.append(record) + for k, v in job.items(): + v = v or None # Save NULL in database for empty string + if k in ["start", "finish"]: + v = ( + datetime.strptime(v, "%Y-%m-%d %H:%M:%S") if v else None + ) # Avoid empty string + elif k in [ + "href_log", + "href_items", + ]: # Log + m = re.search(HREF_PATTERN, v) if v else 
None + v = m.group(1) if m else v + setattr(record, k, v) + if not job["start"]: + record.status = STATUS_PENDING + elif not job["finish"]: + record.status = STATUS_RUNNING + else: + record.status = STATUS_FINISHED + if not job["start"]: + record.pages = None + record.items = None + elif self.liststats_datas: + try: + data = self.liststats_datas[job["project"]][job["spider"]][ + job["job"] + ] + record.pages = data["pages"] # Logparser: None or non-negative int + record.items = data["items"] # Logparser: None or non-negative int + except KeyError: + pass + except Exception as err: + self.logger.error(err) + # SQLite DateTime type only accepts Python datetime and date objects as input + record.update_time = datetime.now() # datetime.now().replace(microsecond=0) + # https://www.reddit.com/r/flask/comments/3tea4k/af_flasksqlalchemy_bulk_updateinsert/ + db.session.add_all(records) + db.session.commit() + + def db_clean_pending_jobs(self): + current_pending_jobs = [ + (job["project"], job["spider"], job["job"]) + for job in self.jobs_backup + if not job["start"] + ] + for record in self.Job.query.filter_by(start=None).all(): + if (record.project, record.spider, record.job) not in current_pending_jobs: + db.session.delete(record) + db.session.commit() + self.logger.info("Deleted pending jobs %s", record) + + def query_jobs(self): + current_running_job_pids = [ + int(job["pid"]) for job in self.jobs_backup if job["pid"] + ] + self.logger.debug("current_running_job_pids: %s", current_running_job_pids) + self.jobs_pagination = ( + self.Job.query.filter_by(deleted=NOT_DELETED) + .order_by( + self.Job.status.asc(), + self.Job.finish.desc(), + self.Job.start.asc(), + self.Job.id.asc(), + ) + .paginate(page=self.page, per_page=self.per_page, error_out=False) + ) + with db.session.no_autoflush: + for index, job in enumerate( + self.jobs_pagination.items, + (self.jobs_pagination.page - 1) * self.jobs_pagination.per_page + 1, + ): + # print(vars(job)) + job.index = index + job.pid = job.pid or "" + job.start = job.start or "" # None for Pending jobs + job.runtime = job.runtime or "" + job.finish = job.finish or "" # None for Pending and Running jobs + job.update_time = self.remove_microsecond(job.update_time) + job.to_be_killed = ( + True + if job.pid and job.pid not in current_running_job_pids + else False + ) + if job.finish: + job.url_multinode = url_for( + "servers", + node=self.node, + opt="schedule", + project=job.project, + version_job=self.DEFAULT_LATEST_VERSION, + spider=job.spider, + ) + job.url_action = url_for( + "schedule", + node=self.node, + project=job.project, + version=self.DEFAULT_LATEST_VERSION, + spider=job.spider, + ) + else: + job.url_multinode = url_for( + "servers", + node=self.node, + opt="stop", + project=job.project, + version_job=job.job, + ) + job.url_action = url_for( + "api", + node=self.node, + opt="stop", + project=job.project, + version_spider_job=job.job, + ) + if job.start: + job.pages = self.NA if job.pages is None else job.pages # May be 0 + job.items = self.NA if job.items is None else job.items # May be 0 + else: # Pending + job.pages = None # from Running/Finished to Pending + job.items = None + continue + job_finished = "True" if job.finish else None + job.url_utf8 = url_for( + "log", + node=self.node, + opt="utf8", + project=job.project, + ui=self.UI, + spider=job.spider, + job=job.job, + job_finished=job_finished, + ) + job.url_stats = url_for( + "log", + node=self.node, + opt="stats", + project=job.project, + ui=self.UI, + spider=job.spider, + job=job.job, + 
job_finished=job_finished, + ) + job.url_clusterreports = url_for( + "clusterreports", + node=self.node, + project=job.project, + spider=job.spider, + job=job.job, + ) + # '/items/demo/test/2018-10-12_205507.log' + job.url_source = urljoin(self.public_url or self.url, job.href_log) + if job.href_items: + job.url_items = urljoin(self.public_url or self.url, job.href_items) + else: + job.url_items = "" + job.url_delete = url_for( + "jobs.xhr", node=self.node, action="delete", id=job.id + ) + + def set_jobs_dict(self): + for ( + job + ) in ( + self.jobs_pagination.items + ): # Pagination obj in handle_jobs_with_db() > query_jobs() + key = "%s/%s/%s" % (job.project, job.spider, job.job) + value = dict( + (k, v) for (k, v) in job.__dict__.items() if not k.startswith("_") + ) + for k, v in value.items(): + if k in ["create_time", "update_time", "start", "finish"]: + value[k] = str(value[k]) + self.jobs_dict[key] = value + + def handle_jobs_without_db(self): + for job in self.jobs: + job["start"] = job["start"][5:] + job["finish"] = job["finish"][5:] + if not job["start"]: + self.pending_jobs.append(job) + else: + if job["finish"]: + self.finished_jobs.append(job) + job["url_multinode_run"] = url_for( + "servers", + node=self.node, + opt="schedule", + project=job["project"], + version_job=self.DEFAULT_LATEST_VERSION, + spider=job["spider"], + ) + job["url_schedule"] = url_for( + "schedule", + node=self.node, + project=job["project"], + version=self.DEFAULT_LATEST_VERSION, + spider=job["spider"], + ) + job["url_start"] = url_for( + "api", + node=self.node, + opt="start", + project=job["project"], + version_spider_job=job["spider"], + ) + else: + self.running_jobs.append(job) + job["url_forcestop"] = url_for( + "api", + node=self.node, + opt="forcestop", + project=job["project"], + version_spider_job=job["job"], + ) + + job_finished = "True" if job["finish"] else None + job["url_utf8"] = url_for( + "log", + node=self.node, + opt="utf8", + project=job["project"], + ui=self.UI, + spider=job["spider"], + job=job["job"], + job_finished=job_finished, + ) + job["url_stats"] = url_for( + "log", + node=self.node, + opt="stats", + project=job["project"], + ui=self.UI, + spider=job["spider"], + job=job["job"], + job_finished=job_finished, + ) + job["url_clusterreports"] = url_for( + "clusterreports", + node=self.node, + project=job["project"], + spider=job["spider"], + job=job["job"], + ) + # Items + m = re.search(HREF_PATTERN, job["href_items"]) + if m: + job["url_items"] = urljoin(self.public_url or self.url, m.group(1)) + else: + job["url_items"] = "" + + if not job["finish"]: + job["url_multinode_stop"] = url_for( + "servers", + node=self.node, + opt="stop", + project=job["project"], + version_job=job["job"], + ) + job["url_stop"] = url_for( + "api", + node=self.node, + opt="stop", + project=job["project"], + version_spider_job=job["job"], + ) + + def set_kwargs(self): + self.kwargs = dict( + node=self.node, + url=self.url, + url_schedule=url_for("schedule", node=self.node), + url_liststats=url_for("api", node=self.node, opt="liststats"), + url_liststats_source="http://%s/logs/stats.json" % self.SCRAPYD_SERVER, + SCRAPYD_SERVER=self.SCRAPYD_SERVER.split(":")[0], + LOGPARSER_VERSION=self.LOGPARSER_VERSION, + JOBS_RELOAD_INTERVAL=self.JOBS_RELOAD_INTERVAL, + LONG_RUNNING_SCRAPER_STOP_INTERVAL=self.LONG_RUNNING_SCRAPER_STOP_INTERVAL, + IS_IE_EDGE=self.IS_IE_EDGE, + pageview=self.metadata["pageview"], + FEATURES=self.FEATURES, + ) + if self.style == "database": + self.kwargs.update( + dict( + 
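# The database style only needs a link back to the classic view plus the Pagination object; everything else is already in self.kwargs + 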
url_jobs_classic=url_for("jobs", node=self.node, style="classic"), + jobs=self.jobs_pagination, + ) + ) + return + + if self.JOBS_FINISHED_JOBS_LIMIT > 0: + self.finished_jobs = self.finished_jobs[::-1][ + : self.JOBS_FINISHED_JOBS_LIMIT + ] + else: + self.finished_jobs = self.finished_jobs[::-1] + self.kwargs.update( + dict( + colspan=14, + url_jobs_database=url_for("jobs", node=self.node, style="database"), + pending_jobs=self.pending_jobs, + running_jobs=self.running_jobs, + finished_jobs=self.finished_jobs, + SHOW_JOBS_JOB_COLUMN=self.SHOW_JOBS_JOB_COLUMN, + ) + ) + + +class JobsXhrView(BaseView): + metadata = metadata + + def __init__(self): + super(JobsXhrView, self).__init__() + + self.action = self.view_args["action"] # delete + self.id = self.view_args["id"] # + + self.js = {} + self.Job = jobs_table_map[self.node] # database class Job + + def dispatch_request(self, **kwargs): + job = self.Job.query.get(self.id) + if job: + try: + job.deleted = DELETED + db.session.commit() + except Exception as err: + self.logger.error(traceback.format_exc()) + db.session.rollback() + self.js["status"] = self.ERROR + self.js["message"] = str(err) + else: + self.js["status"] = self.OK + self.logger.info(self.js.setdefault("tip", "Deleted %s" % job)) + else: + self.js["status"] = self.ERROR + self.js["message"] = "job #%s not found in the database" % self.id + + return self.json_dumps(self.js, as_response=True) diff --git a/views/dashboard/node_reports.py b/views/dashboard/node_reports.py new file mode 100644 index 00000000..8a81fabc --- /dev/null +++ b/views/dashboard/node_reports.py @@ -0,0 +1,53 @@ +# coding: utf-8 +import json + +from flask import render_template, url_for + +from ..baseview import BaseView + + +class NodeReportsView(BaseView): + + def __init__(self): + super(NodeReportsView, self).__init__() + + self.url = url_for('jobs', node=self.node, listjobs='True') + self.text = '' + self.jobs = [] + self.pending_jobs = [] + self.running_jobs = [] + self.finished_jobs = [] + self.template = 'scrapydweb/node_reports.html' + + def dispatch_request(self, **kwargs): + self.text = self.get_response_from_view(self.url, as_json=False) + try: + self.jobs = json.loads(self.text) + except ValueError as err: + self.logger.error("Fail to decode json from %s: %s", self.url, err) + return self.text + + for job in self.jobs: + if not job['start']: + self.pending_jobs.append(job) + else: + if job['finish']: + self.finished_jobs.append(job) + else: + self.running_jobs.append(job) + + if self.JOBS_FINISHED_JOBS_LIMIT > 0: + self.finished_jobs = self.finished_jobs[::-1][:self.JOBS_FINISHED_JOBS_LIMIT] + else: + self.finished_jobs = self.finished_jobs[::-1] + kwargs = dict( + node=self.node, + url=self.url, + pending_jobs=self.pending_jobs, + running_jobs=self.running_jobs, + finished_jobs=self.finished_jobs, + url_report=url_for('log', node=self.node, opt='report', project='PROJECT_PLACEHOLDER', + spider='SPIDER_PLACEHOLDER', job='JOB_PLACEHOLDER'), + url_schedule=url_for('schedule', node=self.node), + ) + return render_template(self.template, **kwargs) diff --git a/views/files/__init__.py b/views/files/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/files/items.py b/views/files/items.py new file mode 100644 index 00000000..51dd5f2d --- /dev/null +++ b/views/files/items.py @@ -0,0 +1,82 @@ +# coding: utf-8 +import os +import re + +from flask import render_template, url_for + +from ...vars import DIRECTORY_KEYS, DIRECTORY_PATTERN, HREF_NAME_PATTERN +from ..baseview import 
BaseView + + +class ItemsView(BaseView): + methods = ['GET'] + + def __init__(self): + super(ItemsView, self).__init__() + + self.project = self.view_args['project'] + self.spider = self.view_args['spider'] + + self.url = 'http://{}/items/{}{}'.format(self.SCRAPYD_SERVER, + '%s/' % self.project if self.project else '', + '%s/' % self.spider if self.spider else '') + if self.SCRAPYD_SERVER_PUBLIC_URL: + self.public_url = re.sub(r'^http.*?/items/', self.SCRAPYD_SERVER_PUBLIC_URL + '/items/', self.url) + else: + self.public_url = '' + self.template = 'scrapydweb/logs_items.html' + self.text = '' + + def dispatch_request(self, **kwargs): + status_code, self.text = self.make_request(self.url, auth=self.AUTH, as_json=False) + if status_code != 200 or not re.search(r'Directory listing for /items/', self.text): + kwargs = dict( + node=self.node, + url=self.url, + status_code=status_code, + text=self.text, + tip="Click the above link to make sure your Scrapyd server is accessable. " + ) + return render_template(self.template_fail, **kwargs) + + return self.generate_response() + + def generate_response(self): + rows = [dict(zip(DIRECTORY_KEYS, row)) for row in re.findall(DIRECTORY_PATTERN, self.text)] + for row in rows: + # demo/ dir + # test/ dir + # a.jl file + row['href'], row['filename'] = re.search(HREF_NAME_PATTERN, row['filename']).groups() + if not row['href'].endswith('/'): # It's a file but not a directory + row['href'] = (self.public_url or self.url) + row['href'] + + if self.project and self.spider: + if row['filename'].endswith('.tar.gz'): + filename_without_ext = row['filename'][:-len('.tar.gz')] + else: + filename_without_ext = os.path.splitext(row['filename'])[0] # '1.1.jl' => ('1.1', '.jl') + row['url_stats'] = url_for('log', node=self.node, opt='stats', project=self.project, + spider=self.spider, job=filename_without_ext) + row['url_utf8'] = url_for('log', node=self.node, opt='utf8', project=self.project, + spider=self.spider, job=filename_without_ext) + row['url_clusterreports'] = url_for('clusterreports', node=self.node, project=self.project, + spider=self.spider, job=self.get_job_without_ext(row['filename'])) + if self.project and self.spider: + url_schedule = url_for('schedule', node=self.node, project=self.project, + version=self.DEFAULT_LATEST_VERSION, spider=self.spider) + url_multinode_run = url_for('servers', node=self.node, opt='schedule', project=self.project, + version_job=self.DEFAULT_LATEST_VERSION, spider=self.spider) + else: + url_schedule = url_multinode_run = '' + kwargs = dict( + node=self.node, + title='items', + project=self.project, + spider=self.spider, + url=self.url, + url_schedule=url_schedule, + url_multinode_run=url_multinode_run, + rows=rows + ) + return render_template(self.template, **kwargs) diff --git a/views/files/log.py b/views/files/log.py new file mode 100644 index 00000000..4cb76a51 --- /dev/null +++ b/views/files/log.py @@ -0,0 +1,544 @@ +# coding: utf-8 +from collections import OrderedDict, defaultdict +from datetime import date, datetime +import io +import json +import os +import re +from subprocess import Popen +import sys +import tarfile +import time + +from flask import flash, get_flashed_messages, render_template, request, url_for +from logparser import parse + +from ...vars import ROOT_DIR +from ..baseview import BaseView + + +EMAIL_CONTENT_KEYS = [ + 'log_critical_count', + 'log_error_count', + 'log_warning_count', + 'log_redirect_count', + 'log_retry_count', + 'log_ignore_count', + 'crawled_pages', + 'scraped_items' +] +job_data_dict 
= {} +# job_finished_key_dict would only be updated by poll POST with ?job_finished=True > monitor_alert(), +# used for determining whether to show 'click to refresh' button in the Log and Stats page. +job_finished_key_dict = defaultdict(OrderedDict) +# For /log/report/ +job_finished_report_dict = defaultdict(OrderedDict) +REPORT_KEYS_SET = {'from_memory', 'status', 'pages', 'items', 'shutdown_reason', 'finish_reason', 'runtime', + 'first_log_time', 'latest_log_time', 'log_categories', 'latest_matches'} + + +# http://flask.pocoo.org/docs/1.0/api/#flask.views.View +# http://flask.pocoo.org/docs/1.0/views/ +class LogView(BaseView): + + def __init__(self): + super(LogView, self).__init__() # super().__init__() + + self.opt = self.view_args['opt'] + self.project = self.view_args['project'] + self.spider = self.view_args['spider'] + self.job = self.view_args['job'] + + self.job_key = '/%s/%s/%s/%s' % (self.node, self.project, self.spider, self.job) + + # Note that self.LOCAL_SCRAPYD_LOGS_DIR may be an empty string + # Extension like '.log' is excluded here. + self.url = u'http://{}/logs/{}/{}/{}'.format(self.SCRAPYD_SERVER, self.project, self.spider, self.job) + self.log_path = os.path.join(self.LOCAL_SCRAPYD_LOGS_DIR, self.project, self.spider, self.job) + + # For Log and Stats buttons in the Logs page: /a.log/?with_ext=True + self.with_ext = request.args.get('with_ext', None) + if self.with_ext: + self.SCRAPYD_LOG_EXTENSIONS = [''] + job_without_ext = self.get_job_without_ext(self.job) + else: + job_without_ext = self.job + + # json file by LogParser + self.json_path = os.path.join(self.LOCAL_SCRAPYD_LOGS_DIR, self.project, self.spider, job_without_ext+'.json') + self.json_url = u'http://{}/logs/{}/{}/{}.json'.format(self.SCRAPYD_SERVER, self.project, self.spider, + job_without_ext) + + self.status_code = 0 + self.text = '' + if self.opt == 'report': + self.template = None + else: + self.template = 'scrapydweb/%s%s.html' % (self.opt, '_mobileui' if self.USE_MOBILEUI else '') + self.kwargs = dict(node=self.node, project=self.project, spider=self.spider, + job=job_without_ext, url_refresh='', url_jump='') + + # Request that comes from poll POST for finished job and links of finished job in the Jobs page + # would be attached with the query string '?job_finished=True' + self.job_finished = request.args.get('job_finished', None) + + self.utf8_realtime = False + self.stats_realtime = False + self.stats_logparser = False + self.report_logparser = False + if self.opt == 'utf8': + flash("It's recommended to check out the latest log via: the Stats page >> View log >> Tail", self.WARN) + self.utf8_realtime = True + elif self.opt == 'stats': + self.stats_realtime = True if request.args.get('realtime', None) else False + self.stats_logparser = not self.stats_realtime + else: + self.report_logparser = True + self.logparser_valid = False + self.backup_stats_valid = False + spider_path = self.mkdir_spider_path() + self.backup_stats_path = os.path.join(spider_path, job_without_ext + '.json') + self.stats = {} + + # job_data for monitor & alert: ([0] * 8, [False] * 6, False, time.time()) + self.job_stats_previous = [] + self.triggered_list = [] + self.has_been_stopped = False + self.last_send_timestamp = 0 + self.job_stats = [] + self.job_stats_diff = [] + # For compatibility with Python 2, use OrderedDict() to keep insertion order + self.email_content_kwargs = OrderedDict() + self.flag = '' + + self.jobs_to_keep = self.JOBS_FINISHED_JOBS_LIMIT or 200 + + def dispatch_request(self, **kwargs): + if 
self.report_logparser: + self.read_stats_for_report() + # Try to request stats by LogParser to avoid reading/requesting the whole log + if not self.logparser_valid and (self.stats_logparser or self.report_logparser): + if self.IS_LOCAL_SCRAPYD_SERVER and self.LOCAL_SCRAPYD_LOGS_DIR: + self.read_local_stats_by_logparser() + if not self.logparser_valid: + self.request_stats_by_logparser() + + if not self.logparser_valid and not self.text: + # Try to read local logfile + if self.IS_LOCAL_SCRAPYD_SERVER and self.LOCAL_SCRAPYD_LOGS_DIR: + self.read_local_scrapy_log() + # Has to request scrapy logfile + if not self.text: + self.request_scrapy_log() + if self.status_code != 200: + if self.stats_logparser or self.report_logparser: + self.load_backup_stats() + if not self.backup_stats_valid: + if not self.report_logparser: + kwargs = dict(node=self.node, url=self.url, status_code=self.status_code, text=self.text) + return render_template(self.template_fail, **kwargs) + else: + self.url += self.SCRAPYD_LOG_EXTENSIONS[0] + else: + self.url += self.SCRAPYD_LOG_EXTENSIONS[0] + + if (not self.utf8_realtime + and not self.logparser_valid + and self.text + and self.status_code in [0, 200]): + self.logger.warning('Parse the whole log') + self.stats = parse(self.text) + # Note that the crawler_engine is not available when using parse() + self.stats.setdefault('crawler_engine', {}) + self.stats.setdefault('status', self.OK) + + if self.report_logparser: + if self.stats and not self.stats.setdefault('from_memory', False): + self.simplify_stats_for_report() + self.keep_stats_for_report() + get_flashed_messages() + # 0, -1, 404 load backup + if self.status_code < 100 or self.stats: + status_code = 200 + else: + status_code = self.status_code + return self.json_dumps(self.stats or dict(status='error'), as_response=True), status_code + else: + self.update_kwargs() + if self.ENABLE_MONITOR and self.POST: # Only poll.py would make POST request + self.monitor_alert() + return render_template(self.template, **self.kwargs) + + def read_local_stats_by_logparser(self): + self.logger.debug("Try to read local stats by LogParser: %s", self.json_path) + try: + with io.open(self.json_path, 'r', encoding='utf-8') as f: + js = json.loads(f.read()) + except Exception as err: + self.logger.error("Fail to read local stats from %s: %s", self.json_path, err) + return + else: + if js.get('logparser_version') != self.LOGPARSER_VERSION: + msg = "Mismatching logparser_version %s in local stats" % js.get('logparser_version') + self.logger.warning(msg) + flash(msg, self.WARN) + return + self.logparser_valid = True + self.stats = js + msg = "Using local stats: LogParser v%s, last updated at %s, %s" % ( + js['logparser_version'], js['last_update_time'], self.handle_slash(self.json_path)) + self.logger.info(msg) + flash(msg, self.INFO) + + def request_stats_by_logparser(self): + self.logger.debug("Try to request stats by LogParser: %s", self.json_url) + # self.make_request() would check the value of key 'status' if as_json=True + status_code, js = self.make_request(self.json_url, auth=self.AUTH, as_json=True, dumps_json=False) + if status_code != 200: + self.logger.error("Fail to request stats from %s, got status_code: %s", self.json_url, status_code) + if self.IS_LOCAL_SCRAPYD_SERVER and self.ENABLE_LOGPARSER: + flash("Request to %s got code %s, wait until LogParser parses the log. " % (self.json_url, status_code), + self.INFO) + else: + flash(("'pip install logparser' on host '%s' and run command 'logparser'. 
" + "Or wait until LogParser parses the log. ") % self.SCRAPYD_SERVER, self.WARN) + return + elif js.get('logparser_version') != self.LOGPARSER_VERSION: + msg = "'pip install --upgrade logparser' on host '%s' to update LogParser to v%s" % ( + self.SCRAPYD_SERVER, self.LOGPARSER_VERSION) + self.logger.warning(msg) + flash(msg, self.WARN) + return + else: + self.logparser_valid = True + # TODO: dirty data + self.stats = js + msg = "LogParser v%s, last updated at %s, %s" % ( + js['logparser_version'], js['last_update_time'], self.json_url) + self.logger.info(msg) + flash(msg, self.INFO) + + def read_local_scrapy_log(self): + for ext in self.SCRAPYD_LOG_EXTENSIONS: + log_path = self.log_path + ext + if os.path.exists(log_path): + if tarfile.is_tarfile(log_path): + self.logger.debug("Ignore local tarfile and use requests instead: %s", log_path) + break + with io.open(log_path, 'r', encoding='utf-8', errors='ignore') as f: + self.text = f.read() + log_path = self.handle_slash(log_path) + msg = "Using local logfile: %s" % log_path + self.logger.debug(msg) + flash(msg, self.INFO) + break + + def request_scrapy_log(self): + for ext in self.SCRAPYD_LOG_EXTENSIONS: + url = self.url + ext + self.status_code, self.text = self.make_request(url, auth=self.AUTH, as_json=False) + if self.status_code == 200: + self.url = url + self.logger.debug("Got logfile from %s", self.url) + break + else: + msg = "Fail to request logfile from %s with extensions %s" % (self.url, self.SCRAPYD_LOG_EXTENSIONS) + self.logger.error(msg) + flash(msg, self.WARN) + self.url += self.SCRAPYD_LOG_EXTENSIONS[0] + + def simplify_stats_for_report(self): + for key in list(self.stats.keys()): + if key not in REPORT_KEYS_SET: + self.stats.pop(key) + try: + for key in self.stats['log_categories']: + self.stats['log_categories'][key] = dict(count=self.stats['log_categories'][key]['count']) + except KeyError: + pass + try: + self.stats['latest_matches'] = dict(latest_item=self.stats['latest_matches']['latest_item']) + except KeyError: + pass + + def keep_stats_for_report(self): + od = job_finished_report_dict[self.node] + if self.job_key in od: + return + if (self.stats.get('shutdown_reason', self.NA) == self.NA + and self.stats.get('finish_reason', self.NA) == self.NA): + return + if set(self.stats.keys()) == REPORT_KEYS_SET: + od[self.job_key] = self.stats + if len(od) > self.jobs_to_keep: + od.popitem(last=False) + self.logger.debug("%s keys in job_finished_report_dict[%s]", len(od), self.node) + + def read_stats_for_report(self): + try: + self.stats = job_finished_report_dict[self.node][self.job_key] + except KeyError: + self.logger.debug("%s not found in job_finished_report_dict[%s]", self.job_key, self.node) + else: + self.logger.debug("%s found in job_finished_report_dict[%s]", self.job_key, self.node) + self.logparser_valid = True + self.stats['from_memory'] = True + + def mkdir_spider_path(self): + node_path = os.path.join(self.STATS_PATH, + re.sub(self.LEGAL_NAME_PATTERN, '-', re.sub(r'[.:]', '_', self.SCRAPYD_SERVER))) + project_path = os.path.join(node_path, self.project) + spider_path = os.path.join(project_path, self.spider) + + if not os.path.isdir(self.STATS_PATH): + os.mkdir(self.STATS_PATH) + if not os.path.isdir(node_path): + os.mkdir(node_path) + if not os.path.isdir(project_path): + os.mkdir(project_path) + if not os.path.isdir(spider_path): + os.mkdir(spider_path) + return spider_path + + def backup_stats(self): + # TODO: delete backup stats json file when the job is deleted in the Jobs page with database view + try: + 
with io.open(self.backup_stats_path, 'w', encoding='utf-8', errors='ignore') as f: + f.write(self.json_dumps(self.stats)) + except Exception as err: + self.logger.error("Fail to backup stats to %s: %s" % (self.backup_stats_path, err)) + try: + os.remove(self.backup_stats_path) + except: + pass + else: + self.logger.info("Saved backup stats to %s", self.backup_stats_path) + + def load_backup_stats(self): + self.logger.debug("Try to load backup stats by LogParser: %s", self.json_path) + try: + with io.open(self.backup_stats_path, 'r', encoding='utf-8') as f: + js = json.loads(f.read()) + except Exception as err: + self.logger.error("Fail to load backup stats from %s: %s", self.backup_stats_path, err) + else: + if js.get('logparser_version') != self.LOGPARSER_VERSION: + msg = "Mismatching logparser_version %s in backup stats" % js.get('logparser_version') + self.logger.warning(msg) + flash(msg, self.WARN) + return + self.logparser_valid = True + self.backup_stats_valid = True + self.stats = js + msg = "Using backup stats: LogParser v%s, last updated at %s, %s" % ( + js['logparser_version'], js['last_update_time'], self.handle_slash(self.backup_stats_path)) + self.logger.info(msg) + flash(msg, self.WARN) + + @staticmethod + def get_ordered_dict(adict): + # 'source', 'last_update_time', 'last_update_timestamp', other keys in order + odict = OrderedDict() + for k in ['source', 'last_update_time', 'last_update_timestamp']: + odict[k] = adict.pop(k) + for k in sorted(adict.keys()): + odict[k] = adict[k] + return odict + + def update_kwargs(self): + if self.utf8_realtime: + self.kwargs['text'] = self.text + self.kwargs['last_update_timestamp'] = time.time() + if self.job_finished or self.job_key in job_finished_key_dict[self.node]: + self.kwargs['url_refresh'] = '' + else: + self.kwargs['url_refresh'] = 'javascript:location.reload(true);' + else: + # Parsed data comes from json.loads, for compatibility with Python 2, + # use str(time_) to avoid [u'2019-01-01 00:00:01', 0, 0, 0, 0] in JavaScript. 
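+ # Only d[0] (the datetime string) needs the conversion; judging by the sample above, the remaining elements of each row are plain numbers.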
+ for d in self.stats['datas']: + d[0] = str(d[0]) + # For sorted orders in stats.html with Python 2 + for k in ['crawler_stats', 'crawler_engine']: + if self.stats[k]: + self.stats[k] = self.get_ordered_dict(self.stats[k]) + + if self.BACKUP_STATS_JSON_FILE: + self.backup_stats() + self.kwargs.update(self.stats) + + if (self.kwargs['finish_reason'] == self.NA + and not self.job_finished + and self.job_key not in job_finished_key_dict[self.node]): + # http://flask.pocoo.org/docs/1.0/api/#flask.Request.url_root + # _query_string = '?ui=mobile' + # self.url_refresh = request.script_root + request.path + _query_string + self.kwargs['url_refresh'] = 'javascript:location.reload(true);' + if self.kwargs['url_refresh']: + if self.stats_logparser and not self.logparser_valid: + self.kwargs['url_jump'] = '' + else: + self.kwargs['url_jump'] = url_for('log', node=self.node, opt='stats', project=self.project, + spider=self.spider, job=self.job, with_ext=self.with_ext, + ui=self.UI, realtime='True' if self.stats_logparser else None) + + # Stats link of 'a.json' from the Logs page should hide these links + if self.with_ext and self.job.endswith('.json'): + self.kwargs['url_source'] = '' + self.kwargs['url_opt_opposite'] = '' + self.kwargs['url_refresh'] = '' + self.kwargs['url_jump'] = '' + else: + if self.SCRAPYD_SERVER_PUBLIC_URL: + self.kwargs['url_source'] = re.sub(r'^http.*?/logs/', self.SCRAPYD_SERVER_PUBLIC_URL + '/logs/', + self.url) + else: + self.kwargs['url_source'] = self.url + self.kwargs['url_opt_opposite'] = url_for('log', node=self.node, + opt='utf8' if self.opt == 'stats' else 'stats', + project=self.project, spider=self.spider, job=self.job, + job_finished=self.job_finished, with_ext=self.with_ext, + ui=self.UI) + + # TODO: https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-x-email-support + def monitor_alert(self): + job_data_default = ([0] * 8, [False] * 6, False, time.time()) + job_data = job_data_dict.setdefault(self.job_key, job_data_default) + (self.job_stats_previous, self.triggered_list, self.has_been_stopped, self.last_send_timestamp) = job_data + self.logger.debug(job_data_dict) + self.job_stats = [self.kwargs['log_categories'][k.lower() + '_logs']['count'] + for k in self.ALERT_TRIGGER_KEYS] + self.job_stats.extend([self.kwargs['pages'] or 0, self.kwargs['items'] or 0]) # May be None by LogParser + self.job_stats_diff = [j - i for i, j in zip(self.job_stats_previous, self.job_stats)] + + self.set_email_content_kwargs() + self.set_monitor_flag() + self.send_alert() + self.handle_data() + + def set_email_content_kwargs(self): + self.email_content_kwargs['SCRAPYD_SERVER'] = self.SCRAPYD_SERVER + self.email_content_kwargs['project'] = self.kwargs['project'] + self.email_content_kwargs['spider'] = self.kwargs['spider'] + self.email_content_kwargs['job'] = self.kwargs['job'] + self.email_content_kwargs['first_log_time'] = self.kwargs['first_log_time'] + self.email_content_kwargs['latest_log_time'] = self.kwargs['latest_log_time'] + self.email_content_kwargs['runtime'] = self.kwargs['runtime'] + self.email_content_kwargs['shutdown_reason'] = self.kwargs['shutdown_reason'] + self.email_content_kwargs['finish_reason'] = self.kwargs['finish_reason'] + self.email_content_kwargs['url_stats'] = request.url + '%sui=mobile' % '&' if request.args else '?' 
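+ # For each alert key, report "previous + increment" when the count grew during this poll interval (e.g. log_error_count = "3 + 2"), otherwise just the current total.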
+ + for idx, key in enumerate(EMAIL_CONTENT_KEYS): + if self.job_stats_diff[idx]: + self.email_content_kwargs[key] = '%s + %s' % (self.job_stats_previous[idx], self.job_stats_diff[idx]) + else: + self.email_content_kwargs[key] = self.job_stats[idx] + # pages and items may be None by LogParser + if self.kwargs['pages'] is None: + self.email_content_kwargs['crawled_pages'] = self.NA + if self.kwargs['items'] is None: + self.email_content_kwargs['scraped_items'] = self.NA + + _url_stop = url_for('api', node=self.node, opt='stop', project=self.project, version_spider_job=self.job) + self.email_content_kwargs['url_stop'] = self.URL_SCRAPYDWEB + _url_stop + + now_timestamp = time.time() + for k in ['latest_crawl', 'latest_scrape', 'latest_log']: + ts = self.kwargs['%s_timestamp' % k] + self.email_content_kwargs[k] = self.NA if ts == 0 else "%s secs ago" % int(now_timestamp - ts) + + self.email_content_kwargs['current_time'] = self.get_now_string(True) + self.email_content_kwargs['logparser_version'] = self.kwargs['logparser_version'] + self.email_content_kwargs['latest_item'] = self.kwargs['latest_matches']['latest_item'] or self.NA + self.email_content_kwargs['Crawler.stats'] = self.kwargs['crawler_stats'] + self.email_content_kwargs['Crawler.engine'] = self.kwargs['crawler_engine'] + + def set_monitor_flag(self): + if self.ON_JOB_FINISHED and self.job_finished: + self.flag = 'Finished' + elif not all(self.triggered_list): + to_forcestop = False + to_stop = False + # The order of the elements in ALERT_TRIGGER_KEYS matters: + # ['CRITICAL', 'ERROR', 'WARNING', 'REDIRECT', 'RETRY', 'IGNORE'] + for idx, key in enumerate(self.ALERT_TRIGGER_KEYS): + if (0 < getattr(self, 'LOG_%s_THRESHOLD' % key, 0) <= self.job_stats[idx] + and not self.triggered_list[idx]): + self.triggered_list[idx] = True + self.email_content_kwargs['log_%s_count' % key.lower()] += ' triggered!!!' 
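+ # Per alert key, FORCESTOP takes precedence over STOP, which takes precedence over a plain trigger.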
+ if getattr(self, 'LOG_%s_TRIGGER_FORCESTOP' % key): + self.flag = '%s_ForceStop' % key if '_ForceStop' not in self.flag else self.flag + to_forcestop = True + elif getattr(self, 'LOG_%s_TRIGGER_STOP' % key) and not self.has_been_stopped: + self.flag = '%s_Stop' % key if 'Stop' not in self.flag else self.flag + self.has_been_stopped = True # Execute 'Stop' one time at most to avoid unclean shutdown + to_stop = True + elif not self.has_been_stopped: + self.flag = '%s_Trigger' % key if not self.flag else self.flag + if to_forcestop: + self.logger.debug("%s: %s", self.flag, self.job_key) + _url = url_for('api', node=self.node, opt='forcestop', + project=self.project, version_spider_job=self.job) + self.get_response_from_view(_url) + elif to_stop: + self.logger.debug("%s: %s", self.flag, self.job_key) + _url = url_for('api', node=self.node, opt='stop', + project=self.project, version_spider_job=self.job) + self.get_response_from_view(_url) + + if not self.flag and 0 < self.ON_JOB_RUNNING_INTERVAL <= time.time() - self.last_send_timestamp: + self.flag = 'Running' + + def send_alert(self): + if (self.flag + and date.isoweekday(date.today()) in self.ALERT_WORKING_DAYS # date.isoweekday(datetime.now()) + and datetime.now().hour in self.ALERT_WORKING_HOURS + ): + kwargs = dict( + flag=self.flag, + pages=self.NA if self.kwargs['pages'] is None else self.kwargs['pages'], + items=self.NA if self.kwargs['items'] is None else self.kwargs['items'], + job_key=self.job_key, + latest_item=self.kwargs['latest_matches']['latest_item'][:100] or self.NA + ) + subject = u"{flag} [{pages}p, {items}i] {job_key} {latest_item} #scrapydweb".format(**kwargs) + self.EMAIL_KWARGS['subject'] = subject + self.EMAIL_KWARGS['content'] = self.json_dumps(self.email_content_kwargs, sort_keys=False) + + data = dict( + subject=subject, + url_stats=self.email_content_kwargs['url_stats'], + url_stop=self.email_content_kwargs['url_stop'], + when=self.get_now_string(True), + ) + if self.ENABLE_SLACK_ALERT: + self.logger.info("Sending alert via Slack: %s", subject) + _url = url_for('sendtextapi', opt='slack', channel_chatid_subject=None, text=None) + self.get_response_from_view(_url, data=data) + if self.ENABLE_TELEGRAM_ALERT: + self.logger.info("Sending alert via Telegram: %s", subject) + _url = url_for('sendtextapi', opt='telegram', channel_chatid_subject=None, text=None) + self.get_response_from_view(_url, data=data) + if self.ENABLE_EMAIL_ALERT: + self.logger.info("Sending alert via Email: %s", subject) + args = [ + sys.executable, + os.path.join(ROOT_DIR, 'utils', 'send_email.py'), + self.json_dumps(self.EMAIL_KWARGS, ensure_ascii=True) + ] + Popen(args) + + def handle_data(self): + if self.flag: + # Update job_data_dict (last_send_timestamp would be updated only when flag is non-empty) + self.logger.debug("Previous job_data['%s'] %s", self.job_key, job_data_dict[self.job_key]) + job_data_dict[self.job_key] = (self.job_stats, self.triggered_list, self.has_been_stopped, time.time()) + self.logger.debug("Updated job_data['%s'] %s", self.job_key, job_data_dict[self.job_key]) + + if self.job_finished: + job_data_dict.pop(self.job_key) + od = job_finished_key_dict[self.node] + od[self.job_key] = None + if len(od) > self.jobs_to_keep: + od.popitem(last=False) + self.logger.info('job_finished: %s', self.job_key) diff --git a/views/files/logs.py b/views/files/logs.py new file mode 100644 index 00000000..cc3de5c5 --- /dev/null +++ b/views/files/logs.py @@ -0,0 +1,80 @@ +# coding: utf-8 +import re + +from flask import render_template, 
url_for + +from ...vars import DIRECTORY_KEYS, DIRECTORY_PATTERN, HREF_NAME_PATTERN +from ..baseview import BaseView + + +class LogsView(BaseView): + methods = ['GET'] + + def __init__(self): + super(LogsView, self).__init__() + + self.project = self.view_args['project'] + self.spider = self.view_args['spider'] + + self.url = 'http://{}/logs/{}{}'.format(self.SCRAPYD_SERVER, + '%s/' % self.project if self.project else '', + '%s/' % self.spider if self.spider else '') + if self.SCRAPYD_SERVER_PUBLIC_URL: + self.public_url = re.sub(r'^http.*?/logs/', self.SCRAPYD_SERVER_PUBLIC_URL + '/logs/', self.url) + else: + self.public_url = '' + self.template = 'scrapydweb/logs_items.html' + self.text = '' + + def dispatch_request(self, **kwargs): + status_code, self.text = self.make_request(self.url, auth=self.AUTH, as_json=False) + if status_code != 200 or not re.search(r'Directory listing for /logs/', self.text): + kwargs = dict( + node=self.node, + url=self.url, + status_code=status_code, + text=self.text, + tip="Click the above link to make sure your Scrapyd server is accessable. " + ) + return render_template(self.template_fail, **kwargs) + + return self.generate_response() + + def generate_response(self): + rows = [dict(zip(DIRECTORY_KEYS, row)) for row in re.findall(DIRECTORY_PATTERN, self.text)] + for row in rows: + # demo/ dir + # test/ dir + # a.log file + row['href'], row['filename'] = re.search(HREF_NAME_PATTERN, row['filename']).groups() + if not row['href'].endswith('/'): # It's a file but not a directory + row['href'] = (self.public_url or self.url) + row['href'] + + if self.project and self.spider: + row['url_stats'] = url_for('log', node=self.node, opt='stats', project=self.project, + spider=self.spider, job=row['filename'], with_ext='True') + if row['filename'].endswith('.json'): # stats by LogParser + row['url_utf8'] = '' + else: + row['url_utf8'] = url_for('log', node=self.node, opt='utf8', project=self.project, + spider=self.spider, job=row['filename'], with_ext='True') + row['url_clusterreports'] = url_for('clusterreports', node=self.node, project=self.project, + spider=self.spider, job=self.get_job_without_ext(row['filename'])) + if self.project and self.spider: + url_schedule = url_for('schedule', node=self.node, project=self.project, + version=self.DEFAULT_LATEST_VERSION, spider=self.spider) + url_multinode_run = url_for('servers', node=self.node, opt='schedule', project=self.project, + version_job=self.DEFAULT_LATEST_VERSION, spider=self.spider) + else: + url_schedule = url_multinode_run = '' + kwargs = dict( + node=self.node, + title='logs', + project=self.project, + spider=self.spider, + url=self.url, + url_schedule=url_schedule, + url_multinode_run=url_multinode_run, + rows=rows + ) + return render_template(self.template, **kwargs) diff --git a/views/files/projects.py b/views/files/projects.py new file mode 100644 index 00000000..3ad96a62 --- /dev/null +++ b/views/files/projects.py @@ -0,0 +1,121 @@ +# coding: utf-8 +import datetime +import json + +from flask import render_template, request, url_for + +from ..baseview import BaseView + + +class ProjectsView(BaseView): + + def __init__(self): + super(ProjectsView, self).__init__() + + self.opt = self.view_args['opt'] + self.project = self.view_args['project'] + self.version_spider_job = self.view_args['version_spider_job'] + + self.text = '' + self.js = {} + + def dispatch_request(self, **kwargs): + # self.text = api(self.node, self.opt, self.project, self.version_spider_job) + _url = url_for('api', node=self.node, 
opt=self.opt, project=self.project, + version_spider_job=self.version_spider_job) # '/1/api/listprojects/' + self.text = self.get_response_from_view(_url, as_json=False) + # _bind = '127.0.0.1' if self.SCRAPYDWEB_BIND == '0.0.0.0' else self.SCRAPYDWEB_BIND + # _url = 'http://{}:{}{}'.format(_bind, self.SCRAPYDWEB_PORT, _url) + # _auth = (self.USERNAME, self.PASSWORD) if self.ENABLE_AUTH else None + # status_code, self.text = self.make_request(_url, auth=_auth, as_json=False) + self.js = json.loads(self.text) + + if self.js['status'] == self.OK: + return getattr(self, self.opt)() + else: + return self.handle_status_error() + + @staticmethod + def delproject(): + return 'project deleted' + + @staticmethod + def delversion(): + return 'version deleted' + + def handle_status_error(self): + if self.opt == 'listversions': + kwargs = dict( + url=self.js['url'], + status=self.js['status'], + url_deploy=url_for('deploy', node=self.node), + url_delproject=url_for('projects', node=self.node, opt='delproject', project=self.project), + project=self.project, + text=self.text, + tip=self.js.get('tip', '') + ) + return render_template('scrapydweb/listversions_error.html', **kwargs) + else: + if self.POST: + # Pass request.url instead of js['url'], for GET method + return ('REQUEST %s got status: %s') % (request.url, self.js['status']) + else: + alert = 'REQUEST got status: %s' % self.js['status'] + message = self.js.get('message', '') + if message: + self.js['message'] = 'See details below' + return render_template(self.template_fail, node=self.node, + alert=alert, text=self.json_dumps(self.js), message=message) + + def listprojects(self): + results = [] + for project in self.js['projects']: + url_listversions = url_for('projects', node=self.node, opt='listversions', project=project) + results.append((project, url_listversions)) + + kwargs = dict( + node=self.node, + url=self.js['url'], + node_name=self.js['node_name'], + results=results, + url_deploy=url_for('deploy', node=self.node) + ) + return render_template('scrapydweb/projects.html', **kwargs) + + def listspiders(self): + spiders = self.js['spiders'] + results = [] + for spider in spiders: + url_schedule = url_for('schedule', node=self.node, + project=self.project, version=self.version_spider_job, spider=spider) + url_multinode_schedule = url_for('servers', node=self.node, opt='schedule', + project=self.project, version_job=self.version_spider_job, spider=spider) + results.append((spider, url_schedule, url_multinode_schedule)) + + return render_template('scrapydweb/listspiders.html', node=self.node, results=results) + + def listversions(self): + results = [] + for version in self.js['versions']: + try: + version_readable = ' (%s)' % datetime.datetime.fromtimestamp(int(version)).isoformat() + except: + version_readable = '' + + url_listspiders = url_for('projects', node=self.node, opt='listspiders', project=self.project, + version_spider_job=version) + url_multinode_delversion = url_for('servers', node=self.node, opt='delversion', project=self.project, + version_job=version) + url_delversion = url_for('projects', node=self.node, opt='delversion', project=self.project, + version_spider_job=version) + results.append((version, version_readable, url_listspiders, url_multinode_delversion, url_delversion)) + + kwargs = dict( + node=self.node, + project=self.project, + results=results, + url_multinode_delproject=url_for('servers', node=self.node, opt='delproject', project=self.project), + url_delproject=url_for('projects', node=self.node, 
opt='delproject', project=self.project) + ) + return render_template('scrapydweb/listversions.html', **kwargs) diff --git a/views/index.py b/views/index.py new file mode 100644 index 00000000..17d06972 --- /dev/null +++ b/views/index.py @@ -0,0 +1,22 @@ +# coding: utf-8 +from flask import redirect, url_for + +from .baseview import BaseView + + +class IndexView(BaseView): + + def __init__(self): + super(IndexView, self).__init__() + + def dispatch_request(self, **kwargs): + if self.SCRAPYD_SERVERS_AMOUNT == 1: + if self.IS_MOBILE and not self.IS_IPAD: + return redirect(url_for('jobs', node=self.node, ui='mobile')) + else: + return redirect(url_for('jobs', node=self.node, ui=self.UI)) + else: + if self.USE_MOBILEUI or (self.IS_MOBILE and not self.IS_IPAD): + return redirect(url_for('jobs', node=self.node, ui='mobile')) + else: + return redirect(url_for('servers', node=self.node, ui=self.UI)) diff --git a/views/operations/__init__.py b/views/operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/operations/deploy.py b/views/operations/deploy.py new file mode 100644 index 00000000..7a560009 --- /dev/null +++ b/views/operations/deploy.py @@ -0,0 +1,467 @@ +# coding: utf-8 +from datetime import datetime +import glob +import io +import os +from pprint import pformat +import re +from shutil import copyfile, copyfileobj, rmtree +from subprocess import CalledProcessError +import tarfile +import tempfile +import time +import zipfile + +from flask import flash, redirect, render_template, request, url_for +from six import text_type +from six.moves.configparser import Error as ScrapyCfgParseError +from werkzeug.utils import secure_filename + +from ...vars import PY2 +from ..baseview import BaseView +from .scrapyd_deploy import _build_egg, get_config +from .utils import mkdir_p, slot + + +SCRAPY_CFG = """ +[settings] +default = projectname.settings + +[deploy] +url = http://localhost:6800/ +project = projectname + +""" +folder_project_dict = {} + + +class DeployView(BaseView): + + def __init__(self): + super(DeployView, self).__init__() + + self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, 'addversion') + self.template = 'scrapydweb/deploy.html' + + self.scrapy_cfg_list = [] + self.project_paths = [] + self.folders = [] + self.projects = [] + self.modification_times = [] + self.latest_folder = '' + + def dispatch_request(self, **kwargs): + self.set_scrapy_cfg_list() + self.project_paths = [os.path.dirname(i) for i in self.scrapy_cfg_list] + self.folders = [os.path.basename(i) for i in self.project_paths] + self.get_modification_times() + self.parse_scrapy_cfg() + + kwargs = dict( + node=self.node, + url=self.url, + url_projects=url_for('projects', node=self.node), + selected_nodes=self.get_selected_nodes(), + folders=self.folders, + projects=self.projects, + modification_times=self.modification_times, + latest_folder=self.latest_folder, + SCRAPY_PROJECTS_DIR=self.SCRAPY_PROJECTS_DIR.replace('\\', '/'), + url_servers=url_for('servers', node=self.node, opt='deploy'), + url_deploy_upload=url_for('deploy.upload', node=self.node) + ) + return render_template(self.template, **kwargs) + + def set_scrapy_cfg_list(self): + # Python 'ascii' codec can't decode byte + try: + self.scrapy_cfg_list = glob.glob(os.path.join(self.SCRAPY_PROJECTS_DIR, '*', u'scrapy.cfg')) + except UnicodeDecodeError: + if PY2: + for name in os.listdir(os.path.join(self.SCRAPY_PROJECTS_DIR, u'')): + if not isinstance(name, text_type): + msg = "Ignore non-unicode filename %s in %s" % (repr(name), 
self.SCRAPY_PROJECTS_DIR) + self.logger.error(msg) + flash(msg, self.WARN) + else: + scrapy_cfg = os.path.join(self.SCRAPY_PROJECTS_DIR, name, u'scrapy.cfg') + if os.path.exists(scrapy_cfg): + self.scrapy_cfg_list.append(scrapy_cfg) + else: + raise + # '/home/username/Downloads/scrapydweb/scrapydweb/data/demo_projects/\udc8b\udc8billegal/scrapy.cfg' + # UnicodeEncodeError: 'utf-8' codec can't encode characters in position 64-65: surrogates not allowed + new_scrapy_cfg_list = [] + for scrapy_cfg in self.scrapy_cfg_list: + try: + scrapy_cfg.encode('utf-8') + except UnicodeEncodeError: + msg = "Ignore scrapy.cfg in illegal pathname %s" % repr(os.path.dirname(scrapy_cfg)) + self.logger.error(msg) + flash(msg, self.WARN) + else: + new_scrapy_cfg_list.append(scrapy_cfg) + self.scrapy_cfg_list = new_scrapy_cfg_list + + self.scrapy_cfg_list.sort(key=lambda x: x.lower()) + + def get_modification_times(self): + timestamps = [self.get_modification_time(path) for path in self.project_paths] + self.modification_times = [datetime.fromtimestamp(ts).strftime('%Y-%m-%dT%H_%M_%S') for ts in timestamps] + + if timestamps: + max_timestamp_index = timestamps.index(max(timestamps)) + self.latest_folder = self.folders[max_timestamp_index] + self.logger.debug('latest_folder: %s', self.latest_folder) + + def get_modification_time(self, path, func_walk=os.walk, retry=True): + # https://stackoverflow.com/a/29685234/10517783 + # https://stackoverflow.com/a/13454267/10517783 + filepath_list = [] + in_top_dir = True + try: + for dirpath, dirnames, filenames in func_walk(path): + if in_top_dir: + in_top_dir = False + dirnames[:] = [d for d in dirnames if d not in ['build', 'project.egg-info']] + filenames = [f for f in filenames + if not (f.endswith('.egg') or f in ['setup.py', 'setup_backup.py'])] + for filename in filenames: + filepath_list.append(os.path.join(dirpath, filename)) + except UnicodeDecodeError: + msg = "Found illegal filenames in %s" % path + self.logger.error(msg) + flash(msg, self.WARN) + if PY2 and retry: + return self.get_modification_time(path, func_walk=self.safe_walk, retry=False) + else: + raise + else: + return max([os.path.getmtime(f) for f in filepath_list] or [time.time()]) + + def parse_scrapy_cfg(self): + for (idx, scrapy_cfg) in enumerate(self.scrapy_cfg_list): + folder = self.folders[idx] + key = '%s (%s)' % (folder, self.modification_times[idx]) + + project = folder_project_dict.get(key, '') + if project: + self.projects.append(project) + self.logger.debug('Hit %s, project %s', key, project) + continue + else: + project = folder + try: + # lib/configparser.py: def get(self, section, option, *, raw=False, vars=None, fallback=_UNSET): + # projectname/scrapy.cfg: [deploy] project = demo + # PY2: get() got an unexpected keyword argument 'fallback' + # project = get_config(scrapy_cfg).get('deploy', 'project', fallback=folder) or folder + project = get_config(scrapy_cfg).get('deploy', 'project') + except ScrapyCfgParseError as err: + self.logger.error("%s parse error: %s", scrapy_cfg, err) + finally: + project = project or folder + self.projects.append(project) + folder_project_dict[key] = project + self.logger.debug('Add %s, project %s', key, project) + + keys_all = list(folder_project_dict.keys()) + keys_exist = ['%s (%s)' % (_folder, _modification_time) + for (_folder, _modification_time) in zip(self.folders, self.modification_times)] + diff = set(keys_all).difference(set(keys_exist)) + for key in diff: + self.logger.debug('Pop %s, project %s', key, folder_project_dict.pop(key)) + 
self.logger.debug(self.json_dumps(folder_project_dict)) + self.logger.debug('folder_project_dict length: %s', len(folder_project_dict)) + + +class DeployUploadView(BaseView): + methods = ['POST'] + + def __init__(self): + super(DeployUploadView, self).__init__() + + self.url = '' + self.template = 'scrapydweb/deploy_results.html' + + self.folder = '' + self.project = '' + self.version = '' + self.selected_nodes_amount = 0 + self.selected_nodes = [] + self.first_selected_node = 0 + + self.eggname = '' + self.eggpath = '' + self.scrapy_cfg_path = '' + self.scrapy_cfg_searched_paths = [] + self.scrapy_cfg_not_found = False + self.scrapy_cfg_parse_error = '' + self.build_egg_subprocess_error = '' + self.data = None + self.js = {} + + self.slot = slot + + def dispatch_request(self, **kwargs): + self.handle_form() + + if self.scrapy_cfg_not_found or self.scrapy_cfg_parse_error or self.build_egg_subprocess_error: + if self.selected_nodes_amount > 1: + alert = "Multinode deployment terminated:" + else: + alert = "Fail to deploy project:" + + if self.scrapy_cfg_not_found: + text = "scrapy.cfg not found" + tip = "Make sure that the 'scrapy.cfg' file resides in your project directory. " + elif self.scrapy_cfg_parse_error: + text = self.scrapy_cfg_parse_error + tip = "Check the content of the 'scrapy.cfg' file in your project directory. " + else: + text = self.build_egg_subprocess_error + tip = ("Check the content of the 'scrapy.cfg' file in your project directory. " + "Or build the egg file by yourself instead. ") + + if self.scrapy_cfg_not_found: + # Handle case when scrapy.cfg not found in zip file which contains illegal pathnames in PY3 + message = "scrapy_cfg_searched_paths:\n%s" % pformat(self.scrapy_cfg_searched_paths) + else: + message = "# The 'scrapy.cfg' file in your project directory should be like:\n%s" % SCRAPY_CFG + + return render_template(self.template_fail, node=self.node, + alert=alert, text=text, tip=tip, message=message) + else: + self.prepare_data() + status_code, self.js = self.make_request(self.url, data=self.data, auth=self.AUTH) + + if self.js['status'] != self.OK: + # With multinodes, would try to deploy to the first selected node first + if self.selected_nodes_amount > 1: + alert = ("Multinode deployment terminated, " + "since the first selected node returned status: " + self.js['status']) + else: + alert = "Fail to deploy project, got status: " + self.js['status'] + message = self.js.get('message', '') + if message: + self.js['message'] = 'See details below' + + return render_template(self.template_fail, node=self.node, + alert=alert, text=self.json_dumps(self.js), message=message) + else: + if self.selected_nodes_amount == 0: + return redirect(url_for('schedule', node=self.node, + project=self.project, version=self.version)) + else: + kwargs = dict( + node=self.node, + selected_nodes=self.selected_nodes, + first_selected_node=self.first_selected_node, + js=self.js, + project=self.project, + version=self.version, + url_projects_first_selected_node=url_for('projects', node=self.first_selected_node), + url_projects_list=[url_for('projects', node=n) for n in range(1, self.SCRAPYD_SERVERS_AMOUNT + 1)], + url_xhr=url_for('deploy.xhr', node=self.node, eggname=self.eggname, + project=self.project, version=self.version), + url_schedule=url_for('schedule', node=self.node, project=self.project, + version=self.version), + url_servers=url_for('servers', node=self.node, opt='schedule', project=self.project, + version_job=self.version) + ) + return render_template(self.template, 
**kwargs) + + def handle_form(self): + # {'1': 'on', + # '2': 'on', + # 'checked_amount': '2', + # 'folder': 'ScrapydWeb_demo', + # 'project': 'demo', + # 'version': '2018-09-05T03_13_50'} + + # With multinodes, would try to deploy to the first selected node first + self.selected_nodes_amount = request.form.get('checked_amount', default=0, type=int) + if self.selected_nodes_amount: + self.selected_nodes = self.get_selected_nodes() + self.first_selected_node = self.selected_nodes[0] + self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVERS[self.first_selected_node - 1], 'addversion') + # Note that self.first_selected_node != self.node + self.AUTH = self.SCRAPYD_SERVERS_AUTHS[self.first_selected_node - 1] + else: + self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, 'addversion') + + # Error: Project names must begin with a letter and contain only letters, numbers and underscores + self.project = re.sub(self.STRICT_NAME_PATTERN, '_', request.form.get('project', '')) or self.get_now_string() + self.version = re.sub(self.LEGAL_NAME_PATTERN, '-', request.form.get('version', '')) or self.get_now_string() + + if request.files.get('file'): + self.handle_uploaded_file() + else: + self.folder = request.form['folder'] # Used with SCRAPY_PROJECTS_DIR to get project_path + self.handle_local_project() + + def handle_local_project(self): + # Use folder instead of project + project_path = os.path.join(self.SCRAPY_PROJECTS_DIR, self.folder) + + self.search_scrapy_cfg_path(project_path) + if not self.scrapy_cfg_path: + self.scrapy_cfg_not_found = True + return + + self.eggname = '%s_%s.egg' % (self.project, self.version) + self.eggpath = os.path.join(self.DEPLOY_PATH, self.eggname) + self.build_egg() + + def handle_uploaded_file(self): + # http://flask.pocoo.org/docs/1.0/api/#flask.Request.form + # + file = request.files['file'] + + # Non-ASCII would be omitted and resulting the filename as to 'egg' or 'tar.gz' + filename = secure_filename(file.filename) + # tar.xz only works on Linux and macOS + if filename in ['egg', 'zip', 'tar.gz']: + filename = '%s_%s.%s' % (self.project, self.version, filename) + else: + filename = '%s_%s_from_file_%s' % (self.project, self.version, filename) + + if filename.endswith('egg'): + self.eggname = filename + self.eggpath = os.path.join(self.DEPLOY_PATH, self.eggname) + file.save(self.eggpath) + self.scrapy_cfg_not_found = False + else: # Compressed file + filepath = os.path.join(self.DEPLOY_PATH, filename) + file.save(filepath) + tmpdir = self.uncompress_to_tmpdir(filepath) + + # Search from the root of tmpdir + self.search_scrapy_cfg_path(tmpdir) + if not self.scrapy_cfg_path: + self.scrapy_cfg_not_found = True + return + + self.eggname = re.sub(r'(\.zip|\.tar\.gz)$', '.egg', filename) + self.eggpath = os.path.join(self.DEPLOY_PATH, self.eggname) + self.build_egg() + + # https://gangmax.me/blog/2011/09/17/12-14-52-publish-532/ + # https://stackoverflow.com/a/49649784 + # When ScrapydWeb runs in Linux/macOS and tries to uncompress zip file from Windows_CN_cp936 + # UnicodeEncodeError: 'ascii' codec can't encode characters in position 7-8: ordinal not in range(128) + # macOS + PY2 would raise OSError: Illegal byte sequence + # Ubuntu + PY2 would raise UnicodeDecodeError in search_scrapy_cfg_path() though f.extractall(tmpdir) works well + def uncompress_to_tmpdir(self, filepath): + self.logger.debug("Uncompressing %s", filepath) + tmpdir = tempfile.mkdtemp(prefix="scrapydweb-uncompress-") + if zipfile.is_zipfile(filepath): + with zipfile.ZipFile(filepath, 'r') as f: 
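+ # See the notes above: in PY2, zip members whose names come from Windows_CN_cp936 archives are decoded from GBK to UTF-8 and the files are copied out one by one below, instead of relying on f.extractall().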
+ if PY2: + tmpdir = tempfile.mkdtemp(prefix="scrapydweb-uncompress-") + for filename in f.namelist(): + try: + filename_utf8 = filename.decode('gbk').encode('utf8') + except (UnicodeDecodeError, UnicodeEncodeError): + filename_utf8 = filename + filepath_utf8 = os.path.join(tmpdir, filename_utf8) + + try: + with io.open(filepath_utf8, 'wb') as f_utf8: + copyfileobj(f.open(filename), f_utf8) + except IOError: + # os.mkdir(filepath_utf8) + # zipfile from Windows "send to zipped" would meet the inner folder first: + # temp\\scrapydweb-uncompress-qrcyc0\\demo7/demo/' + mkdir_p(filepath_utf8) + else: + f.extractall(tmpdir) + else: # tar.gz + with tarfile.open(filepath, 'r') as tar: # Open for reading with transparent compression (recommended). + tar.extractall(tmpdir) + + self.logger.debug("Uncompressed to %s", tmpdir) + # In case uploading a compressed file in which scrapy_cfg_dir contains none ascii in python 2, + # whereas selecting a project for auto packaging, scrapy_cfg_dir is unicode + # print(repr(tmpdir)) + # print(type(tmpdir)) + return tmpdir.decode('utf8') if PY2 else tmpdir + + def search_scrapy_cfg_path(self, search_path, func_walk=os.walk, retry=True): + try: + for dirpath, dirnames, filenames in func_walk(search_path): + self.scrapy_cfg_searched_paths.append(os.path.abspath(dirpath)) + self.scrapy_cfg_path = os.path.abspath(os.path.join(dirpath, 'scrapy.cfg')) + if os.path.exists(self.scrapy_cfg_path): + self.logger.debug("scrapy_cfg_path: %s", self.scrapy_cfg_path) + return + except UnicodeDecodeError: + msg = "Found illegal filenames in %s" % search_path + self.logger.error(msg) + flash(msg, self.WARN) + if PY2 and retry: + self.search_scrapy_cfg_path(search_path, func_walk=self.safe_walk, retry=False) + else: + raise + else: + self.logger.error("scrapy.cfg not found in: %s", search_path) + self.scrapy_cfg_path = '' + + def build_egg(self): + try: + egg, tmpdir = _build_egg(self.scrapy_cfg_path) + except ScrapyCfgParseError as err: + self.logger.error(err) + self.scrapy_cfg_parse_error = err + return + except CalledProcessError as err: + self.logger.error(err) + self.build_egg_subprocess_error = err + return + + scrapy_cfg_dir = os.path.dirname(self.scrapy_cfg_path) + copyfile(egg, os.path.join(scrapy_cfg_dir, self.eggname)) + copyfile(egg, self.eggpath) + rmtree(tmpdir) + self.logger.debug("Egg file saved to: %s", self.eggpath) + + def prepare_data(self): + with io.open(self.eggpath, 'rb') as f: + content = f.read() + self.data = { + 'project': self.project, + 'version': self.version, + 'egg': content + } + + self.slot.add_egg(self.eggname, content) + + +class DeployXhrView(BaseView): + + def __init__(self): + super(DeployXhrView, self).__init__() + + self.eggname = self.view_args['eggname'] + self.project = self.view_args['project'] + self.version = self.view_args['version'] + + self.url = 'http://{}/{}.json'.format(self.SCRAPYD_SERVER, 'addversion') + + self.slot = slot + + def dispatch_request(self, **kwargs): + content = self.slot.egg.get(self.eggname) + # content = None # For test only + if not content: + eggpath = os.path.join(self.DEPLOY_PATH, self.eggname) + with io.open(eggpath, 'rb') as f: + content = f.read() + + data = { + 'project': self.project, + 'version': self.version, + 'egg': content + } + status_code, js = self.make_request(self.url, data=data, auth=self.AUTH) + return self.json_dumps(js, as_response=True) diff --git a/views/operations/execute_task.py b/views/operations/execute_task.py new file mode 100644 index 00000000..b357cb30 --- /dev/null +++ 
b/views/operations/execute_task.py @@ -0,0 +1,172 @@ +# coding: utf-8 +import json +import logging +import re +import time +import traceback + +from ...common import get_now_string, get_response_from_view, handle_metadata +from ...models import Task, TaskResult, TaskJobResult, db +from ...utils.scheduler import scheduler + + +apscheduler_logger = logging.getLogger('apscheduler') + +REPLACE_URL_NODE_PATTERN = re.compile(r'^/(\d+)/') +EXTRACT_URL_SERVER_PATTERN = re.compile(r'//(.+?:\d+)') + + +class TaskExecutor(object): + + def __init__(self, task_id, task_name, url_scrapydweb, url_schedule_task, url_delete_task_result, + auth, selected_nodes): + self.task_id = task_id + self.task_name = task_name + self.url_scrapydweb = url_scrapydweb + self.url_schedule_task = url_schedule_task + self.url_delete_task_result = url_delete_task_result + self.auth = auth + self.data = dict( + task_id=task_id, + jobid='task_%s_%s' % (task_id, get_now_string(allow_space=False)) + ) + self.selected_nodes = selected_nodes + self.task_result_id = None # Be set in get_task_result_id() + self.pass_count = 0 + self.fail_count = 0 + + self.sleep_seconds_before_retry = 3 + self.nodes_to_retry = [] + self.logger = logging.getLogger(self.__class__.__name__) + + def main(self): + self.get_task_result_id() + for index, nodes in enumerate([self.selected_nodes, self.nodes_to_retry]): + if not nodes: + continue + if index == 1: + # https://apscheduler.readthedocs.io/en/latest/userguide.html#shutting-down-the-scheduler + self.logger.warning("Retry task #%s (%s) on nodes %s in %s seconds", + self.task_id, self.task_name, nodes, self.sleep_seconds_before_retry) + time.sleep(self.sleep_seconds_before_retry) + self.logger.warning("Retrying task #%s (%s) on nodes %s", self.task_id, self.task_name, nodes) + for node in nodes: + result = self.schedule_task(node) + if result: + if result['status'] == 'ok': + self.pass_count += 1 + else: + self.fail_count += 1 + self.db_insert_task_job_result(result) + self.db_update_task_result() + + def get_task_result_id(self): + # SQLite objects created in a thread can only be used in that same thread + with db.app.app_context(): + task_result = TaskResult() + task_result.task_id = self.task_id + db.session.add(task_result) + # db.session.flush() # Get task_result.id before committing, flush() is part of commit() + db.session.commit() + # If directly use task_result.id later: Instance is not bound to a Session + self.task_result_id = task_result.id + self.logger.debug("Get new task_result_id %s for task #%s", self.task_result_id, self.task_id) + + def schedule_task(self, node): + # TODO: Application was not able to create a URL adapter for request independent URL generation. + # You might be able to fix this by setting the SERVER_NAME config variable. 
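+ # As a workaround, the node id in the stored relative URL is rewritten below (e.g. '/1/schedule/task/' -> '/2/schedule/task/') rather than calling url_for() outside of a request context.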
+ # with app.app_context(): + # url_schedule_task = url_for('schedule.task', node=node) + # http://127.0.0.1:5000/1/schedule/task/ + # /1/schedule/task/ + url_schedule_task = re.sub(REPLACE_URL_NODE_PATTERN, r'/%s/' % node, self.url_schedule_task) + js = {} + try: + # assert '/1/' not in url_schedule_task, u"'故意出错'\r\n\"出错\"'故意出错'\r\n\"出错\"" + # assert False + # time.sleep(10) + js = get_response_from_view(url_schedule_task, auth=self.auth, data=self.data, as_json=True) + assert js['status_code'] == 200 and js['status'] == 'ok', "Request got %s" % js + except Exception as err: + if node not in self.nodes_to_retry: + apscheduler_logger.warning("Fail to execute task #%s (%s) on node %s, would retry later: %s", + self.task_id, self.task_name, node, err) + self.nodes_to_retry.append(node) + return {} + else: + apscheduler_logger.error("Fail to execute task #%s (%s) on node %s, no more retries: %s", + self.task_id, self.task_name, node, traceback.format_exc()) + js.setdefault('url', self.url_scrapydweb) # '127.0.0.1:5000' + js.setdefault('status_code', -1) + js.setdefault('status', 'exception') + js.setdefault('exception', traceback.format_exc()) + js.update(node=node) + return js + + def db_insert_task_job_result(self, js): + with db.app.app_context(): + if not TaskResult.query.get(self.task_result_id): + apscheduler_logger.error("task_result #%s of task #%s not found", self.task_result_id, self.task_id) + apscheduler_logger.warning("Discard task_job_result of task_result #%s of task #%s: %s", + self.task_result_id, self.task_id, js) + return + task_job_result = TaskJobResult() + task_job_result.task_result_id = self.task_result_id + task_job_result.node = js['node'] + task_job_result.server = re.search(EXTRACT_URL_SERVER_PATTERN, js['url']).group(1) # '127.0.0.1:6800' + task_job_result.status_code = js['status_code'] + task_job_result.status = js['status'] + task_job_result.result = js.get('jobid', '') or js.get('message', '') or js.get('exception', '') + db.session.add(task_job_result) + db.session.commit() + self.logger.info("Inserted task_job_result: %s", task_job_result) + + # https://stackoverflow.com/questions/13895176/sqlalchemy-and-sqlite-database-is-locked + def db_update_task_result(self): + with db.app.app_context(): + task = Task.query.get(self.task_id) + task_result = TaskResult.query.get(self.task_result_id) + if not task: + apscheduler_logger.error("Task #%s not found", self.task_id) + # if task_result: + # '/1/tasks/xhr/delete/1/1/' + url_delete_task_result = re.sub(r'/\d+/\d+/$', '/%s/%s/' % (self.task_id, self.task_result_id), + self.url_delete_task_result) + js = get_response_from_view(url_delete_task_result, auth=self.auth, data=self.data, as_json=True) + apscheduler_logger.warning("Deleted task_result #%s [FAIL %s, PASS %s] of task #%s: %s", + self.task_result_id, self.fail_count, self.pass_count, self.task_id, js) + return + if not task_result: + apscheduler_logger.error("task_result #%s of task #%s not found", self.task_result_id, self.task_id) + apscheduler_logger.warning("Failed to update task_result #%s [FAIL %s, PASS %s] of task #%s", + self.task_result_id, self.fail_count, self.pass_count, self.task_id) + return + task_result.fail_count = self.fail_count + task_result.pass_count = self.pass_count + db.session.commit() + self.logger.info("Inserted task_result: %s", task_result) + + +def execute_task(task_id): + with db.app.app_context(): + task = Task.query.get(task_id) + apscheduler_job = scheduler.get_job(str(task_id)) + if not task: + apscheduler_job.remove() + 
apscheduler_logger.error("apscheduler_job #{id} removed since task #{id} not exist. ".format(id=task_id)) + else: + metadata = handle_metadata() + username = metadata.get('username', '') + password = metadata.get('password', '') + url_delete_task_result = metadata.get('url_delete_task_result', '/1/tasks/xhr/delete/1/1/') + task_executor = TaskExecutor(task_id=task_id, + task_name=task.name, + url_scrapydweb=metadata.get('url_scrapydweb', 'http://127.0.0.1:5000'), + url_schedule_task=metadata.get('url_schedule_task', '/1/schedule/task/'), + url_delete_task_result=url_delete_task_result, + auth=(username, password) if username and password else None, + selected_nodes=json.loads(task.selected_nodes)) + try: + task_executor.main() + except Exception: + apscheduler_logger.error(traceback.format_exc()) diff --git a/views/operations/schedule.py b/views/operations/schedule.py new file mode 100644 index 00000000..d0261a54 --- /dev/null +++ b/views/operations/schedule.py @@ -0,0 +1,642 @@ +# coding: utf-8 +from collections import OrderedDict +from datetime import datetime +import io +import json +import logging +from math import ceil +import os +import pickle +import re +import traceback + +from flask import Blueprint, redirect, render_template, request, send_file, url_for + +from ...models import Task, db +from ...vars import RUN_SPIDER_HISTORY_LOG, UA_DICT +from ..baseview import BaseView +from .execute_task import execute_task +from .utils import slot + + +apscheduler_logger = logging.getLogger('apscheduler') + + +def generate_cmd(auth, url, data): + if auth: + cmd = 'curl -u %s:%s %s' % (auth[0], auth[1], url) + else: + cmd = 'curl %s' % url + + for key, value in data.items(): + if key == 'setting': + for v in value: + t = (tuple(v.split('=', 1))) + if v.startswith('USER_AGENT='): + cmd += ' --data-urlencode "setting=%s=%s"' % t + else: + cmd += ' -d setting=%s=%s' % t + elif key != '__task_data': + cmd += ' -d %s=%s' % (key, value) + + return cmd + + +bp = Blueprint('schedule', __name__, url_prefix='/') + + +@bp.route('/schedule/history/') +def history(): + return send_file(RUN_SPIDER_HISTORY_LOG, mimetype='text/plain', cache_timeout=0) + + +class ScheduleView(BaseView): + + def __init__(self): + super(ScheduleView, self).__init__() + + self.project = self.view_args['project'] + self.version = self.view_args['version'] + self.spider = self.view_args['spider'] + self.task_id = request.args.get('task_id', default=None, type=int) + self.task = None + + self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER + self.template = 'scrapydweb/schedule.html' + self.kwargs = {} + + self.selected_nodes = [] + self.first_selected_node = None + + def dispatch_request(self, **kwargs): + if self.task_id: + self.task = Task.query.get(self.task_id) + if not self.task: + message = "Task #%s not found" % self.task_id + self.logger.error(message) + return render_template(self.template_fail, node=self.node, message=message) + self.query_task() + elif self.POST: + self.selected_nodes = self.get_selected_nodes() + self.first_selected_node = self.selected_nodes[0] + else: + if self.project: + # START button of Jobs page / Run Spider button of Logs page + self.selected_nodes = [self.node] + else: + self.selected_nodes = [] + self.first_selected_node = self.node + + self.update_kwargs() + return render_template(self.template, **self.kwargs) + + def query_task(self): + task = self.task + self.project = task.project + self.version = task.version + self.spider = task.spider + + self.selected_nodes = 
json.loads(task.selected_nodes) + self.first_selected_node = self.selected_nodes[0] + + # 'settings_arguments': {'arg1': '233', 'setting': ['CLOSESPIDER_PAGECOUNT=10',]} + settings_arguments = json.loads(task.settings_arguments) + self.kwargs['expand_settings_arguments'] = len(settings_arguments) > 1 or settings_arguments['setting'] + settings_dict = dict(s.split('=') for s in settings_arguments.pop('setting')) + arguments_dict = settings_arguments + + self.kwargs['jobid'] = task.jobid or self.get_now_string() + USER_AGENT = settings_dict.pop('USER_AGENT', '') + # Chrome|iPhone|iPad|Android + self.kwargs['USER_AGENT'] = dict((v, k) for k, v in UA_DICT.items()).get(USER_AGENT, '') + for k in ['ROBOTSTXT_OBEY', 'COOKIES_ENABLED']: + v = settings_dict.pop(k, '') + self.kwargs[k] = v if v in ['True', 'False'] else '' + self.kwargs['CONCURRENT_REQUESTS'] = settings_dict.pop('CONCURRENT_REQUESTS', '') + self.kwargs['DOWNLOAD_DELAY'] = settings_dict.pop('DOWNLOAD_DELAY', '') + # "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1" + additional = '' + # Use sorted() for Python 2 + for k, v in sorted(settings_dict.items()): + additional += "-d setting=%s=%s\r\n" % (k, v) + for k, v in sorted(arguments_dict.items()): + additional += "-d %s=%s\r\n" % (k, v) + # print(repr(additional)) + self.kwargs['additional'] = additional + + self.kwargs['expand_timer_task'] = True + self.kwargs['task_id'] = self.task_id + self.kwargs['name'] = task.name or 'task #%s' % self.task_id + if not self.kwargs['name'].endswith(' - edit'): + self.kwargs['name'] += ' - edit' + + self.kwargs['year'] = task.year or '*' + self.kwargs['month'] = task.month or '*' + self.kwargs['day'] = task.day or '*' + self.kwargs['week'] = task.week or '*' + # To avoid SyntaxError in javascript with Python 2: day_of_week: [u'*'], + self.kwargs['day_of_week'] = [str(s.strip()) for s in task.day_of_week.split(',')] or ['*'] # 'mon-fri,sun' + self.kwargs['hour'] = task.hour or '*' + self.kwargs['minute'] = task.minute or '0' + self.kwargs['second'] = task.second or '0' + + self.kwargs['start_date'] = task.start_date or '' + self.kwargs['end_date'] = task.end_date or '' + + if task.timezone: # To avoid showing 'None' when editing the task + self.kwargs['timezone'] = task.timezone + self.kwargs['jitter'] = max(0, task.jitter) + self.kwargs['misfire_grace_time'] = max(0, task.misfire_grace_time) + self.kwargs['coalesce'] = task.coalesce if task.coalesce in ['True', 'False'] else 'True' + self.kwargs['max_instances'] = max(1, task.max_instances) + + def update_kwargs(self): + self.kwargs.update(dict( + node=self.node, + url=self.url, + url_deploy=url_for('deploy', node=self.node), + project=self.project, + version=self.version, + spider=self.spider, + # jobid=self.get_now_string(), + selected_nodes=self.selected_nodes, + first_selected_node=self.first_selected_node, + url_servers=url_for('servers', node=self.node, opt='schedule'), + url_schedule_run=url_for('schedule.run', node=self.node), + url_schedule_history=url_for('schedule.history'), + url_listprojects=url_for('api', node=self.node, opt='listprojects'), + url_listversions=url_for('api', node=self.node, opt='listversions', project='PROJECT_PLACEHOLDER'), + url_listspiders=url_for('api', node=self.node, opt='listspiders', project='PROJECT_PLACEHOLDER', + version_spider_job='VERSION_PLACEHOLDER'), + url_schedule_check=url_for('schedule.check', node=self.node) + )) + self.kwargs.setdefault('expand_settings_arguments', self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS) 
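+ # setdefault() keeps any values already filled in by query_task() when editing an existing task; the defaults below only take effect for a fresh schedule form.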
+ self.kwargs.setdefault('jobid', '') + # self.kwargs.setdefault('UA_DICT', UA_DICT) + self.kwargs.setdefault('CUSTOM_USER_AGENT', self.SCHEDULE_CUSTOM_USER_AGENT) + # custom|Chrome|iPhone|iPad|Android + self.kwargs.setdefault('USER_AGENT', '' if self.SCHEDULE_USER_AGENT is None else self.SCHEDULE_USER_AGENT) + self.kwargs.setdefault('ROBOTSTXT_OBEY', '' if self.SCHEDULE_ROBOTSTXT_OBEY is None else self.SCHEDULE_ROBOTSTXT_OBEY) + self.kwargs.setdefault('COOKIES_ENABLED', '' if self.SCHEDULE_COOKIES_ENABLED is None else self.SCHEDULE_COOKIES_ENABLED) + self.kwargs.setdefault('CONCURRENT_REQUESTS', '' if self.SCHEDULE_CONCURRENT_REQUESTS is None else self.SCHEDULE_CONCURRENT_REQUESTS) + self.kwargs.setdefault('DOWNLOAD_DELAY', '' if self.SCHEDULE_DOWNLOAD_DELAY is None else self.SCHEDULE_DOWNLOAD_DELAY) + # additional = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1" + self.kwargs.setdefault('additional', self.SCHEDULE_ADDITIONAL) + + self.kwargs.setdefault('expand_timer_task', 'add_task' in request.args) # '+' button in the TimeTasks page + self.kwargs.setdefault('task_id', 0) + self.kwargs['action'] = 'add_fire' + self.kwargs['trigger'] = 'cron' + self.kwargs.setdefault('name', '') + self.kwargs['replace_existing'] = 'True' + + self.kwargs.setdefault('year', '*') + self.kwargs.setdefault('month', '*') + self.kwargs.setdefault('day', '*') + self.kwargs.setdefault('week', '*') + self.kwargs.setdefault('day_of_week', ['*']) # 'mon-fri, sun' + self.kwargs.setdefault('hour', '*') + self.kwargs.setdefault('minute', '0') + self.kwargs.setdefault('second', '0') + + self.kwargs.setdefault('start_date', '') + self.kwargs.setdefault('end_date', '') + + self.kwargs.setdefault('timezone', self.scheduler.timezone) + self.kwargs.setdefault('jitter', 0) + self.kwargs.setdefault('misfire_grace_time', 600) + self.kwargs.setdefault('coalesce', 'True') + self.kwargs.setdefault('max_instances', 1) + + +class ScheduleCheckView(BaseView): + + def __init__(self): + super(ScheduleCheckView, self).__init__() + + self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER + self.template = 'scrapydweb/schedule.html' + + self.filename = '' + self.data = OrderedDict() + self.slot = slot + + def dispatch_request(self, **kwargs): + self.logger.debug('request.form from %s\n%s', request.url, self.json_dumps(request.form)) + self.prepare_data() + self.update_data_for_timer_task() + # self.logger.warning(self.json_dumps(self.data)) # TypeError: Object of type datetime is not JSON serializable + cmd = generate_cmd(self.AUTH, self.url, self.data) + # '-d' may be in project name, like 'ScrapydWeb-demo' + cmd = re.sub(r'(curl -u\s+.*?:.*?)\s+(http://)', r'\1 \\\r\n\2', cmd) + cmd = re.sub(r'\s+-d\s+', ' \\\r\n-d ', cmd) + cmd = re.sub(r'\s+--data-urlencode\s+', ' \\\r\n--data-urlencode ', cmd) + return self.json_dumps({'filename': self.filename, 'cmd': cmd}, as_response=True) + + def prepare_data(self): + for k, d in [('project', 'projectname'), ('_version', self.DEFAULT_LATEST_VERSION), + ('spider', 'spidername')]: + self.data[k] = request.form.get(k, d) + if self.data['_version'] == self.DEFAULT_LATEST_VERSION: + self.data.pop('_version') + + jobid = request.form.get('jobid') or self.get_now_string() + self.data['jobid'] = re.sub(self.LEGAL_NAME_PATTERN, '-', jobid) + + self.data['setting'] = [] + ua = UA_DICT.get(request.form.get('USER_AGENT', ''), '') + if ua: + self.data['setting'].append('USER_AGENT=%s' % ua) + + for key in ['ROBOTSTXT_OBEY', 'COOKIES_ENABLED', 
'CONCURRENT_REQUESTS', 'DOWNLOAD_DELAY']: + value = request.form.get(key, '') + if value: + self.data['setting'].append("%s=%s" % (key, value)) + + additional = request.form.get('additional', '').strip() + if additional: + parts = [i.strip() for i in re.split(r'-d\s+', re.sub(r'[\r\n]', ' ', additional)) if i.strip()] + for part in parts: + part = re.sub(r'\s*=\s*', '=', part) + if '=' not in part: + continue + m_setting = re.match(r'setting=([A-Z_]{6,31}=.+)', part) # 'EDITOR' 'DOWNLOADER_CLIENTCONTEXTFACTORY' + if m_setting: + self.data['setting'].append(m_setting.group(1)) + continue + m_arg = re.match(r'([a-zA-Z_][0-9a-zA-Z_]*)=(.+)', part) + if m_arg and m_arg.group(1) != 'setting': + self.data[m_arg.group(1)] = m_arg.group(2) + + self.data['setting'].sort() + _version = self.data.get('_version', 'default-the-latest-version') + _filename = '{project}_{version}_{spider}'.format(project=self.data['project'], + version=_version, + spider=self.data['spider']) + self.filename = '%s.pickle' % re.sub(self.LEGAL_NAME_PATTERN, '-', _filename) + filepath = os.path.join(self.SCHEDULE_PATH, self.filename) + with io.open(filepath, 'wb') as f: + f.write(pickle.dumps(self.data)) + + self.slot.add_data(self.filename, self.data) + + def get_int_from_form(self, key, default, minimum): + value = request.form.get(key) or default + try: + return max(minimum, int(ceil(float(value)))) + except (TypeError, ValueError) as err: + self.logger.warning("%s. The value of request.form['%s'] would be set as %s", err, key, default) + return default + + def update_data_for_timer_task(self): + if not request.form.get('trigger'): + return + # In case passing '-d task_data=xxx' in the additional text box + self.data['__task_data'] = dict( + action=request.form.get('action') or 'add_fire', + task_id=request.form.get('task_id', default=0, type=int), + + # trigger=request.form.get('trigger') or 'cron', + trigger='cron', + # id = # put off in ScheduleRunView.db_insert_task() + name=request.form.get('name') or None, # (str) – the description of this job None + replace_existing=request.form.get('replace_existing', 'True') == 'True', + + year=request.form.get('year') or '*', # (int|str) – 4-digit year + month=request.form.get('month') or '*', # (int|str) – month (1-12) + day=request.form.get('day') or '*', # (int|str) – day of the (1-31) + week=request.form.get('week') or '*', # (int|str) – ISO week (1-53) + # (int|str) – number or name of weekday (0-6 or mon,tue,wed,thu,fri,sat,sun) + day_of_week=request.form.get('day_of_week') or '*', # From browser: "*"|"*,mon-fri"|"", May be '0', + hour=request.form.get('hour') or '*', # (int|str) – hour (0-23) May be '0' + minute=request.form.get('minute') or '0', # (int|str) – minute (0-59) 0 + second=request.form.get('second') or '0', # (int|str) – second (0-59) 0 + + start_date=request.form.get('start_date') or None, # (datetime|str) None + end_date=request.form.get('end_date') or None, # (datetime|str) None + # from tzlocal import get_localzone + # +8 + # -5 + timezone=request.form.get('timezone') or None, # (datetime.tzinfo|str) defaults to scheduler timezone + jitter=self.get_int_from_form('jitter', 0, minimum=0), # (int|None) + # TypeError: misfire_grace_time must be either None or a positive integer + # Passing '0' would be saved as None for positive infinity. 
+ misfire_grace_time=self.get_int_from_form('misfire_grace_time', 600, minimum=0) or None, # (int) + coalesce=(request.form.get('coalesce') or 'True') == 'True', # (bool) + # TypeError: max_instances must be a positive integer + max_instances=self.get_int_from_form('max_instances', 1, minimum=1) # (int) + ) + + +class ScheduleRunView(BaseView): + + def __init__(self): + super(ScheduleRunView, self).__init__() + + self.url = '' + self.template = 'scrapydweb/schedule_results.html' + + self.slot = slot + self.selected_nodes_amount = 0 + self.selected_nodes = [] + self.first_selected_node = 0 + self.filename = request.form['filename'] + self.data = {} + self.task_data = {} + self.task = None + self.task_id = 0 + self._action = '' + self.to_update_task = False + self.add_task_result = False + self.add_task_flash = '' + self.add_task_error = '' + self.add_task_message = '' + self.js = {} + + def dispatch_request(self, **kwargs): + self.handle_form() + self.handle_action() + self.update_history() + return self.generate_response() + + def handle_form(self): + self.selected_nodes_amount = request.form.get('checked_amount', default=0, type=int) + # With multinodes, would try to Schedule to the first selected node first + if self.selected_nodes_amount: + self.selected_nodes = self.get_selected_nodes() + self.first_selected_node = self.selected_nodes[0] + self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVERS[self.first_selected_node - 1] + # Note that self.first_selected_node != self.node + self.AUTH = self.SCRAPYD_SERVERS_AUTHS[self.first_selected_node - 1] + else: + self.selected_nodes = [self.node] + self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER + + # in handle_action(): self.data.pop('__task_data', {}) self.task_data.pop + self.data = self.slot.data.get(self.filename, {}) + # self.data = None # For test only + if not self.data: + filepath = os.path.join(self.SCHEDULE_PATH, self.filename) + with io.open(filepath, 'rb') as f: + self.data = pickle.loads(f.read()) + + def handle_action(self): + self.logger.debug(self.json_dumps(self.data)) + self.task_data = self.data.pop('__task_data', {}) # Now self.data is clean + self.logger.debug("task_data: %s", self.task_data) + if self.task_data: # For timer task + self._action = self.task_data.pop('action') # add|add_fire|add_pause + self.task_id = self.task_data.pop('task_id') # 0|positive int from edit button in the Timer Tasks page + self.to_update_task = self.task_data.pop('replace_existing') and self.task_id # replace_existing: bool + self.db_insert_update_task() + self.add_update_task() + else: + self._action = 'run' + status_code, self.js = self.make_request(self.url, data=self.data, auth=self.AUTH) + + # https://apscheduler.readthedocs.io/en/latest/userguide.html + # https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html#module-apscheduler.triggers.cron + def db_insert_update_task(self): + if self.to_update_task: + self.task = Task.query.get(self.task_id) + self.logger.debug("Selected %s", self.task) + self.db_process_task() + self.task.update_time = datetime.now() + # Put off in add_update_task() + # db.session.commit() + # self.logger.debug("Updated %s", self.task) + else: + self.task = Task() + self.db_process_task() + db.session.add(self.task) + db.session.commit() + self.logger.debug("Inserted %s", self.task) + self.task_id = self.task.id + + def db_process_task(self): + data = dict(self.data) # Used in update_history() and generate_response() + + self.task.project = data.pop('project') + self.task.version = 
data.pop('_version', self.DEFAULT_LATEST_VERSION) + self.task.spider = data.pop('spider') + self.task.jobid = data.pop('jobid') + self.task.settings_arguments = self.json_dumps(data, sort_keys=True, indent=None) + self.task.selected_nodes = str(self.selected_nodes) + + self.task.name = self.task_data['name'] + self.task.trigger = self.task_data['trigger'] + + self.task.year = self.task_data['year'] + self.task.month = self.task_data['month'] + self.task.day = self.task_data['day'] + self.task.week = self.task_data['week'] + self.task.day_of_week = self.task_data['day_of_week'] + self.task.hour = self.task_data['hour'] + self.task.minute = self.task_data['minute'] + self.task.second = self.task_data['second'] + self.task.start_date = self.task_data['start_date'] + self.task.end_date = self.task_data['end_date'] + + self.task.timezone = self.task_data['timezone'] + self.task.jitter = self.task_data['jitter'] + self.task.misfire_grace_time = self.task_data['misfire_grace_time'] + self.task.coalesce = 'True' if self.task_data['coalesce'] else 'False' # bool True would be stored as 1 + self.task.max_instances = self.task_data['max_instances'] + + # https://apscheduler.readthedocs.io/en/latest/modules/schedulers/base.html#apscheduler.schedulers.base.BaseScheduler.add_job + def add_update_task(self): + # class apscheduler.schedulers.base.BaseScheduler + # def add_job(self, func, trigger=None, args=None, kwargs=None, id=None, name=None, + # misfire_grace_time=undefined, coalesce=undefined, max_instances=undefined, + # next_run_time=undefined, jobstore='default', executor='default', + # replace_existing=False, **trigger_args): + # TODO: hard coding of url_schedule_task + # if 'url_schedule_task' not in self.metadata: + # url_schedule_task = url_for('schedule.task', node=1) # /1/schedule/task/ + # handle_metadata('url_schedule_task', url_schedule_task) + kwargs = dict(task_id=self.task_id) + self.task_data['id'] = str(self.task_id) # TypeError: id must be a nonempty string + # apscheduler.executors.default: Job "execute_task (trigger: cron[year='*'..." executed successfully + self.task_data['name'] = self.task_data['name'] or 'task_%s' % self.task_id # To replace execute_task with name + + # next_run_time (datetime) – when to first run the job, regardless of the trigger + # (pass None to add the job as paused) + if self._action == 'add_fire': + # In case the task fires before db.session.commit() + if self.to_update_task: + self.logger.info("Task #%s would be fired right after the apscheduler_job is updated", self.task_id) + else: + self.task_data['next_run_time'] = datetime.now() # datetime.utcnow() + postfix = "Reload this page several seconds later to check out the execution result. " + elif self._action == 'add_pause': + self.task_data['next_run_time'] = None + postfix = "Click the Paused button to resume it. " + else: + postfix = "Click the Running button to pause it. 
" + + # https://apscheduler.readthedocs.io/en/latest/modules/schedulers/base.html#apscheduler.schedulers.base.BaseScheduler.add_job + msg = '' + try: + # assert False, u"'故意出错'\r\n\"出错\"" + job_instance = self.scheduler.add_job(func=execute_task, args=None, kwargs=kwargs, + replace_existing=True, **self.task_data) + except Exception as err: + # ValueError: Unrecognized expression "10/*" for field "second" + if self.to_update_task: + db.session.rollback() + self.logger.warning("Rollback %s", self.task) + self.add_task_result = False + self.add_task_error = str(err) + msg = traceback.format_exc() + self.logger.error(msg) + else: + # https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html#daylight-saving-time-behavior + # either use a timezone that does not observe DST, for instance UTC + # https://www.douban.com/note/147740972/ + # F12 Date() Tue Jan 29 2019 13:30:57 GMT+0800 (China Standard Time) + # https://www.timeanddate.com/time/zones/cst + # Other time zones named CST: China Standard Time, Cuba Standard Time + if self.to_update_task: + db.session.commit() + self.logger.debug("Updated %s", self.task) + # In case the task fires before db.session.commit() + if self._action == 'add_fire': + self.logger.info("Modifying next_run_time of updated task #%s to fire it right now", self.task_id) + job_instance.modify(next_run_time=datetime.now()) + self.add_task_result = True + msg = u"{target} task #{task_id} ({task_name}) successfully, next run at {next_run_time}. ".format( + target="Update" if self.to_update_task else 'Add', + task_id=self.task_id, task_name=self.task_data['name'], + next_run_time=job_instance.next_run_time or self.NA) + self.add_task_flash = msg + postfix + apscheduler_logger.warning(msg) + # TypeError: vars() argument must have __dict__ attribute + # apscheduler_logger.warning(vars(job_instance)) + # pformat({k: getattr(job_instance, k) for k in job_instance.__slots__}, indent=4) + job_instance_dict = dict( + id=job_instance.id, + name=job_instance.name, + kwargs=job_instance.kwargs, + misfire_grace_time=job_instance.misfire_grace_time, + max_instances=job_instance.max_instances, + trigger=repr(job_instance.trigger), + next_run_time=repr(job_instance.next_run_time), + ) + apscheduler_logger.warning("%s job_instance: \n%s", "Updated" if self.to_update_task else 'Added', + self.json_dumps(job_instance_dict)) + finally: + if 'next_run_time' in self.task_data: # TypeError: Object of type datetime is not JSON serializable + self.task_data['next_run_time'] = str(self.task_data['next_run_time'] or self.NA) + self.add_task_message = (u"{msg}\nkwargs for execute_task():\n{kwargs}\n\n" + u"task_data for scheduler.add_job():\n{task_data}").format( + msg=msg, kwargs=self.json_dumps(kwargs), task_data=self.json_dumps(self.task_data)) + self.logger.debug(self.add_task_message) + + def update_history(self): + with io.open(RUN_SPIDER_HISTORY_LOG, 'r+', encoding='utf-8') as f: + content_backup = f.read() + f.seek(0) + content = os.linesep.join([ + '%s %s <%s>' % ('#' * 50, self.get_now_string(True), self._action), + str([self.SCRAPYD_SERVERS[i - 1] for i in self.selected_nodes]), + generate_cmd(self.AUTH, self.url, self.data), + self.add_task_message or self.json_dumps(self.js), + '' + ]) + f.write(content) + f.write(content_backup) + + def generate_response(self): + if self._action in ['add', 'add_fire', 'add_pause']: + if self.add_task_result: + return redirect(url_for('tasks', node=self.node, flash=self.add_task_flash)) + else: + return render_template(self.template_fail, 
node=self.node, + alert="Fail to add/edit task with error:", + text=self.add_task_error, + tip=("Check out the HELP section in the Run Spider page, and then " + "go back to the Timer Tasks page to re-edit task #%s. ") % self.task_id, + message=self.add_task_message) + if self.js['status'] == self.OK: + if not self.selected_nodes_amount: + return redirect(url_for('jobs', node=self.node)) + + kwargs = dict( + node=self.node, + project=self.data['project'], + version=self.data.get('_version', self.DEFAULT_LATEST_VERSION), + spider=self.data['spider'], + selected_nodes=self.selected_nodes, + first_selected_node=self.first_selected_node, + js=self.js, + url_stats_list=[url_for('log', node=node, opt='stats', project=self.data['project'], + spider=self.data['spider'], job=self.data['jobid']) + for node in range(1, self.SCRAPYD_SERVERS_AMOUNT + 1)], + url_xhr=url_for('schedule.xhr', node=self.node, filename=self.filename), + url_servers=url_for('servers', node=self.node, opt='getreports', project=self.data['project'], + spider=self.data['spider'], version_job=self.data['jobid']) + ) + return render_template(self.template, **kwargs) + else: + if self.selected_nodes_amount > 1: + alert = ("Multinode schedule terminated, " + "since the first selected node returned status: " + self.js['status']) + else: + alert = "Fail to schedule, got status: " + self.js['status'] + + message = self.js.get('message', '') + if message: + self.js['message'] = 'See details below' + + return render_template(self.template_fail, node=self.node, + alert=alert, text=self.json_dumps(self.js), message=message) + + +class ScheduleXhrView(BaseView): + + def __init__(self): + super(ScheduleXhrView, self).__init__() + + self.filename = self.view_args['filename'] + self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER + self.slot = slot + self.data = None + + def dispatch_request(self, **kwargs): + self.data = self.slot.data.get(self.filename) + # self.data = None # For test only + if not self.data: + filepath = os.path.join(self.SCHEDULE_PATH, self.filename) + with io.open(filepath, 'rb') as f: + self.data = pickle.loads(f.read()) + + status_code, js = self.make_request(self.url, data=self.data, auth=self.AUTH) + return self.json_dumps(js, as_response=True) + + +class ScheduleTaskView(BaseView): + + def __init__(self): + super(ScheduleTaskView, self).__init__() + + self.url = 'http://%s/schedule.json' % self.SCRAPYD_SERVER + self.task_id = request.form['task_id'] + self.jobid = request.form['jobid'] + self.data = {} + + def dispatch_request(self, **kwargs): + task = Task.query.get(self.task_id) + if not task: + message = "Task #%s not found" % self.task_id + self.logger.error(message) + js = dict(url=self.url, auth=self.AUTH, status_code=-1, status=self.ERROR, message=message) + else: + self.data['project'] = task.project + if task.version != self.DEFAULT_LATEST_VERSION: + self.data['_version'] = task.version + self.data['spider'] = task.spider + self.data['jobid'] = self.jobid + self.data.update(json.loads(task.settings_arguments)) + status_code, js = self.make_request(self.url, data=self.data, auth=self.AUTH) + + return self.json_dumps(js, as_response=True) diff --git a/views/operations/scrapyd_deploy.py b/views/operations/scrapyd_deploy.py new file mode 100644 index 00000000..4285ca63 --- /dev/null +++ b/views/operations/scrapyd_deploy.py @@ -0,0 +1,86 @@ +# coding: utf-8 +""" +source: https://github.com/scrapy/scrapyd-client +scrapyd-client/scrapyd_client/deploy.py +""" +import errno +import glob +import os +from shutil 
import copyfile +from subprocess import check_call +import sys +import tempfile + +from flask import current_app as app +from six.moves.configparser import SafeConfigParser + + +_SETUP_PY_TEMPLATE = """# Automatically created by: scrapydweb x scrapyd-client + +from setuptools import setup, find_packages + +setup( + name = 'project', + version = '1.0', + packages = find_packages(), + entry_points = {'scrapy': ['settings = %(settings)s']}, +) +""" + + +def get_config(sources): + """Get Scrapy config file as a SafeConfigParser""" + # sources = get_sources(use_closest) + cfg = SafeConfigParser() + cfg.read(sources) + return cfg + + +def retry_on_eintr(func, *args, **kw): + """Run a function and retry it while getting EINTR errors""" + while True: + try: + return func(*args, **kw) + except IOError as e: + if e.errno != errno.EINTR: + raise + + +def _build_egg(scrapy_cfg_path): + cwd = os.getcwd() + # If get_config() raise an error without executing os.chdir(cwd), would cause subsequent test cases + # to raise TemplateNotFound when testing in Python 2 on Debian or macOS. + # Debug: add print(environment.list_templates()) in flask/templating.py _get_source_fast() would show [] + try: + os.chdir(os.path.dirname(scrapy_cfg_path)) + + if os.path.exists('setup.py'): + copyfile('setup.py', 'setup_backup.py') + # lib/configparser.py: def get(self, section, option, *, raw=False, vars=None, fallback=_UNSET): + # projectname/scrapy.cfg: [settings] default = demo.settings + settings = get_config(scrapy_cfg_path).get('settings', 'default') # demo.settings + _create_default_setup_py(settings=settings) + + d = tempfile.mkdtemp(prefix="scrapydweb-deploy-") + o = open(os.path.join(d, "stdout"), "wb") + e = open(os.path.join(d, "stderr"), "wb") + retry_on_eintr(check_call, [sys.executable, 'setup.py', 'clean', '-a', 'bdist_egg', '-d', d], + stdout=o, stderr=e) + egg = glob.glob(os.path.join(d, '*.egg'))[0] + o.close() + e.close() + except: + os.chdir(cwd) + raise + finally: + os.chdir(cwd) + + return egg, d + + +def _create_default_setup_py(**kwargs): + with open('setup.py', 'w') as f: + content = _SETUP_PY_TEMPLATE % kwargs + app.logger.debug('New setup.py') + # app.logger.debug(content) + f.write(content) diff --git a/views/operations/utils.py b/views/operations/utils.py new file mode 100644 index 00000000..beccc097 --- /dev/null +++ b/views/operations/utils.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from collections import OrderedDict +import errno +import os + + +class Slot: + def __init__(self, limit_egg=10, limit_data=10): + self.limit_egg = limit_egg + self.limit_data = limit_data + self._egg = OrderedDict() + self._data = OrderedDict() + + @property + def egg(self): + return self._egg + + @property + def data(self): + return self._data + + def add_egg(self, key, value): + self._egg[key] = value + if len(self._egg) > self.limit_egg: + self._egg.popitem(last=False) + + def add_data(self, key, value): + self._data[key] = value + if len(self._data) > self.limit_data: + self._data.popitem(last=False) + + +slot = Slot() + + +# https://stackoverflow.com/a/600612/10517783 +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as err: # Python >2.5 + if err.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise diff --git a/views/overview/__init__.py b/views/overview/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/overview/multinode.py b/views/overview/multinode.py new file mode 100644 index 00000000..bb0eca91 --- /dev/null +++ b/views/overview/multinode.py @@ -0,0 
+1,49 @@ +# coding: utf-8 +from flask import render_template, url_for + +from ..baseview import BaseView + + +class MultinodeView(BaseView): + methods = ['POST'] + + def __init__(self): + super(MultinodeView, self).__init__() + + self.opt = self.view_args['opt'] + self.project = self.view_args['project'] + self.version_job = self.view_args['version_job'] + + self.template = 'scrapydweb/multinode_results.html' + + def dispatch_request(self, **kwargs): + selected_nodes = self.get_selected_nodes() + url_xhr = url_for('api', node=selected_nodes[0], opt=self.opt, + project=self.project, version_spider_job=self.version_job) + + if self.opt == 'stop': + title = "Stop Job (%s) of Project (%s)" % (self.version_job, self.project) + url_servers = url_for('servers', node=self.node, opt='listjobs', project=self.project) + btn_servers = "Servers » List Running Jobs" + elif self.opt == 'delversion': + title = "Delete Version (%s) of Project (%s)" % (self.version_job, self.project) + url_servers = url_for('servers', node=self.node, opt='listversions', project=self.project) + btn_servers = "Servers » List Versions" + else: # elif opt == 'delproject': + title = "Delete Project (%s)" % self.project + url_servers = url_for('servers', node=self.node, opt='listprojects', project=self.project) + btn_servers = "Servers » List Projects" + + kwargs = dict( + node=self.node, + title=title, + opt=self.opt, + project=self.project, + version_job=self.version_job, + selected_nodes=selected_nodes, + url_xhr=url_xhr, + url_servers=url_servers, + btn_servers=btn_servers, + url_projects_list=[url_for('projects', node=n) for n in range(1, self.SCRAPYD_SERVERS_AMOUNT + 1)] + ) + return render_template(self.template, **kwargs) diff --git a/views/overview/servers.py b/views/overview/servers.py new file mode 100644 index 00000000..c693e929 --- /dev/null +++ b/views/overview/servers.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from flask import flash, render_template, url_for + +from ...common import handle_metadata +from ..baseview import BaseView + + +metadata = dict(pageview=handle_metadata().get('pageview', 1)) + + +class ServersView(BaseView): + metadata = metadata + + def __init__(self): + super(ServersView, self).__init__() + + self.opt = self.view_args['opt'] + self.project = self.view_args['project'] + self.version_job = self.view_args['version_job'] + self.spider = self.view_args['spider'] + + self.url = 'http://%s/daemonstatus.json' % self.SCRAPYD_SERVER + self.template = 'scrapydweb/servers.html' + self.selected_nodes = [] + + def dispatch_request(self, **kwargs): + self.metadata['pageview'] += 1 + self.logger.debug('metadata: %s', self.metadata) + + if self.SCRAPYD_SERVERS_AMOUNT > 1 and not (self.metadata['pageview'] > 2 and self.metadata['pageview'] % 100): + if not self.ENABLE_AUTH: + flash("Set 'ENABLE_AUTH = True' to enable basic auth for the web UI", self.INFO) + if self.IS_LOCAL_SCRAPYD_SERVER and not self.ENABLE_LOGPARSER: + flash("Set 'ENABLE_LOGPARSER = True' to run LogParser as a subprocess at startup", self.WARN) + if not self.ENABLE_MONITOR: + flash("Set 'ENABLE_MONITOR = True' to enable the monitor feature", self.INFO) + + if self.POST: + self.selected_nodes = self.get_selected_nodes() + else: + if self.SCRAPYD_SERVERS_AMOUNT == 1: + self.selected_nodes = [1] + else: + self.selected_nodes = [] + + kwargs = dict( + node=self.node, + opt=self.opt, + project=self.project, + version_job=self.version_job, + spider=self.spider, + url=self.url, + selected_nodes=self.selected_nodes, + IS_IE_EDGE=self.IS_IE_EDGE, +
pageview=self.metadata['pageview'], + FEATURES=self.FEATURES, + DEFAULT_LATEST_VERSION=self.DEFAULT_LATEST_VERSION, + url_daemonstatus=url_for('api', node=self.node, opt='daemonstatus'), + url_getreports=url_for('clusterreports', node=self.node, project='PROJECT_PLACEHOLDER', + spider='SPIDER_PLACEHOLDER', job='JOB_PLACEHOLDER'), + url_liststats=url_for('api', node=self.node, opt='liststats', project='PROJECT_PLACEHOLDER', + version_spider_job='JOB_PLACEHOLDER'), + url_listprojects=url_for('api', node=self.node, opt='listprojects'), + url_listversions=url_for('api', node=self.node, opt='listversions', project='PROJECT_PLACEHOLDER'), + url_listspiders=url_for('api', node=self.node, opt='listspiders', project='PROJECT_PLACEHOLDER', + version_spider_job='VERSION_PLACEHOLDER'), + url_listjobs=url_for('api', node=self.node, opt='listjobs', project='PROJECT_PLACEHOLDER'), + url_deploy=url_for('deploy', node=self.node), + url_schedule=url_for('schedule', node=self.node, project='PROJECT_PLACEHOLDER', + version='VERSION_PLACEHOLDER', spider='SPIDER_PLACEHOLDER'), + url_stop=url_for('multinode', node=self.node, opt='stop', project='PROJECT_PLACEHOLDER', + version_job='JOB_PLACEHOLDER'), + url_delversion=url_for('multinode', node=self.node, opt='delversion', project='PROJECT_PLACEHOLDER', + version_job='VERSION_PLACEHOLDER'), + url_delproject=url_for('multinode', node=self.node, opt='delproject', project='PROJECT_PLACEHOLDER') + ) + return render_template(self.template, **kwargs) diff --git a/views/overview/tasks.py b/views/overview/tasks.py new file mode 100644 index 00000000..c9345900 --- /dev/null +++ b/views/overview/tasks.py @@ -0,0 +1,427 @@ +# coding: utf-8 +from datetime import datetime +import json +import logging +import traceback + +from flask import Blueprint, flash, render_template, request, send_file, url_for + +from ...common import handle_metadata +from ...models import Task, TaskResult, TaskJobResult, db +from ...vars import SCHEDULER_STATE_DICT, STATE_PAUSED, STATE_RUNNING, TIMER_TASKS_HISTORY_LOG +from ..baseview import BaseView + + +apscheduler_logger = logging.getLogger('apscheduler') +metadata = dict(per_page=handle_metadata().get('tasks_per_page', 100)) + +bp = Blueprint('tasks', __name__, url_prefix='/') + + +@bp.route('/tasks/history/') +def history(): + return send_file(TIMER_TASKS_HISTORY_LOG, mimetype='text/plain', cache_timeout=0) + + +# https://apscheduler.readthedocs.io/en/latest/userguide.html +# https://apscheduler.readthedocs.io/en/latest/modules/schedulers/base.html#module-apscheduler.schedulers.base +# https://apscheduler.readthedocs.io/en/latest/modules/job.html#apscheduler.job.Job +class TasksView(BaseView): + metadata = metadata + + def __init__(self): + super(TasksView, self).__init__() + + self.task_id = self.view_args['task_id'] # , 0 ok, -1 fail + self.task_result_id = self.view_args['task_result_id'] # + + self.flash = request.args.get('flash', '') + self.per_page = request.args.get('per_page', default=self.metadata['per_page'], type=int) + if self.per_page != self.metadata['per_page']: + self.metadata['per_page'] = self.per_page + handle_metadata('tasks_per_page', self.per_page) + self.logger.debug("Change per_page to %s", self.metadata['per_page']) + self.page = request.args.get('page', default=1, type=int) + + # If self.task is defined before handle_metadata('tasks_per_page', self.per_page) + # Instance is not bound to a Session; + # attribute refresh operation cannot proceed + # (Background on this error at: http://sqlalche.me/e/bhk3) + self.task = 
Task.query.get(self.task_id) if self.task_id else None + self.kwargs = {} + self.template = '' + + def dispatch_request(self, **kwargs): + if self.flash: + flash(self.flash, self.INFO) + + # http://flask-sqlalchemy.pocoo.org/2.3/queries/#queries-in-views + # Use get_or_404 to ensure that url_for() for the Stats page works + # task = Task.query.get_or_404(self.task_id) + if self.task_id and not self.task: + message = "Task #%s not found" % self.task_id + self.logger.error(message) + return render_template(self.template_fail, node=self.node, message=message) + + if self.task_id and self.task_result_id: + self.template = 'scrapydweb/task_job_results.html' + self.query_task_job_results() + elif self.task_id: + self.template = 'scrapydweb/task_results.html' + self.query_task_results() + else: + self.template = 'scrapydweb/tasks.html' + self.query_tasks() + return render_template(self.template, **self.kwargs) + + def query_tasks(self): + if self.scheduler.state == STATE_PAUSED: + flash("Click the DISABLED button to enable the scheduler for timer tasks. ", self.WARN) + self.remove_apscheduler_job_without_task() + + # https://stackoverflow.com/questions/43103585/python-flask-sqlalchemy-pagination + # https://blog.miguelgrinberg.com/post/the-flask-mega-tutorial-part-ix-pagination + # http://flask-sqlalchemy.pocoo.org/2.3/api/#flask_sqlalchemy.BaseQuery.paginate + # paginate(page=None, per_page=None, error_out=True, max_per_page=None) + # tasks = Task.query.all() + tasks = Task.query.order_by(Task.id.desc()).paginate( + page=self.page, per_page=self.per_page, error_out=False) + self.process_tasks(tasks) + + # default-sort in Vue would cause the background color of the buttons to flash once + # :default-sort="{prop: 'status', order: 'descending'}" + tasks.items.sort(key=lambda task: task.status, reverse=True) + + if self.scheduler.state == STATE_RUNNING: + scheduler_action_button = 'ENABLED' + url_scheduler_action = url_for('tasks.xhr', node=self.node, action='disable') + else: + scheduler_action_button = 'DISABLED' + url_scheduler_action = url_for('tasks.xhr', node=self.node, action='enable') + + self.kwargs = dict( + node=self.node, + tasks=tasks, + url_add_task=url_for('schedule', node=self.node, add_task='True'), + scheduler_action_button=scheduler_action_button, + url_scheduler_action=url_scheduler_action, + url_tasks_history=url_for('tasks.history') + ) + + def remove_apscheduler_job_without_task(self): + # In case a task was removed from the database while its apscheduler_job is still running + apscheduler_job_id_set = set([j.id for j in self.scheduler.get_jobs(jobstore='default')]) # type(j.id): str + task_id_set = set([str(t.id) for t in Task.query.all()]) # type(t.id): int + for i in apscheduler_job_id_set.difference(task_id_set): + self.scheduler.remove_job(i, jobstore='default') + msg = "apscheduler_job #{id} removed since task #{id} does not exist. 
".format(id=i) + apscheduler_logger.error(msg) + flash(msg, self.WARN) + + def process_tasks(self, tasks): + with db.session.no_autoflush: # To avoid in place updating + # for task in tasks: # TypeError: 'Pagination' object is not iterable # tasks.item: list + for index, task in enumerate(tasks.items, (tasks.page - 1) * tasks.per_page + 1): + # Columns: Name | Prev run result | Task results + task.index = index + task.name = task.name or '' + task.timezone = task.timezone or self.scheduler.timezone + task.create_time = self.remove_microsecond(task.create_time) + task.update_time = self.remove_microsecond(task.update_time) + task_results = TaskResult.query.filter_by(task_id=task.id).order_by(TaskResult.id.desc()) + task.run_times = task_results.count() + task.url_task_results = url_for('tasks', node=self.node, task_id=task.id) + if task.run_times > 0: + task.fail_times = sum([int(t.fail_count > 0) for t in task_results]) + latest_task_result = task_results[0] + if latest_task_result.fail_count == 0 and latest_task_result.pass_count == 1: + task_job_result = TaskJobResult.query.filter_by(task_result_id=latest_task_result.id).order_by( + TaskJobResult.id.desc()).first() + task.prev_run_result = task_job_result.result[-19:] # task_N_2019-01-01T00_00_01 + task.url_prev_run_result = url_for('log', node=task_job_result.node, opt='stats', + project=task.project, spider=task.spider, + job=task_job_result.result) + else: + # 'FAIL 0, PASS 0' if execute_task() has not finished + task.prev_run_result = 'FAIL %s, PASS %s' % (latest_task_result.fail_count, + latest_task_result.pass_count) + task.url_prev_run_result = url_for('tasks', node=self.node, + task_id=task.id, task_result_id=latest_task_result.id) + else: + task.fail_times = 0 + task.prev_run_result = self.NA + task.url_prev_run_result = task.url_task_results + # Columns: Status | Actions | Next run time + task.url_edit = url_for('schedule', node=self.node, task_id=task.id) + # PostgreSQL 8.3 removes implicit casts + # sqlalchemy.exc.ProgrammingError: (psycopg2.ProgrammingError) operator does not exist: character varying = integer + # LINE 3: WHERE apscheduler_jobs.id = 2 + # HINT: No operator matches the given name and argument types. You might need to add explicit type casts. + # [SQL: 'SELECT apscheduler_jobs.job_state \nFROM apscheduler_jobs \nWHERE apscheduler_jobs.id = %(id_1)s'] [parameters: {'id_1': 2}] + # (Background on this error at: http://sqlalche.me/e/f405) + apscheduler_job = self.scheduler.get_job(str(task.id)) # Return type: Job or None + if apscheduler_job: + self.logger.debug("apscheduler_job %s: %s", apscheduler_job.name, apscheduler_job) + if apscheduler_job.next_run_time: + task.status = 'Running' + action = 'pause' + if self.scheduler.state == STATE_PAUSED: + task.next_run_time = "Click DISABLED button first. 
" + else: + # TypeError: argument of type 'datetime.datetime' is not iterable + task.next_run_time = str(apscheduler_job.next_run_time) # '2019-01-01 00:00:01+08:00' + task.url_fire = url_for('tasks.xhr', node=self.node, action='fire', task_id=task.id) + else: + task.status = 'Paused' + action = 'resume' + task.next_run_time = self.NA + task.url_fire = '' + task.url_status = url_for('tasks.xhr', node=self.node, action=action, task_id=task.id) + task.action = 'Stop' + task.url_action = url_for('tasks.xhr', node=self.node, action='remove', task_id=task.id) + else: + task.status = 'Finished' + task.url_status = task.url_task_results # '', 'javascript:;' + task.action = 'Delete' + task.url_action = url_for('tasks.xhr', node=self.node, action='delete', task_id=task.id) + task.next_run_time = self.NA + task.url_fire = '' + + def query_task_results(self): + task_results = TaskResult.query.filter_by(task_id=self.task_id).order_by( + TaskResult.id.desc()).paginate(page=self.page, per_page=self.per_page, error_out=False) + # In case that execute_task() has not finished or selected_nodes is modified + with_job = all([task_result.fail_count + task_result.pass_count == 1 for task_result in task_results.items]) + + with db.session.no_autoflush: + for index, task_result in enumerate(task_results.items, + (task_results.page - 1) * task_results.per_page + 1): + task_result.index = index + if with_job: # To show task_job_result in task_results.html + self.template = 'scrapydweb/task_results_with_job.html' + task_job_result = TaskJobResult.query.filter_by(task_result_id=task_result.id).order_by( + TaskJobResult.id.desc()).first() + task_result.task_job_result_id = task_job_result.id + task_result.run_time = self.remove_microsecond(task_job_result.run_time) + task_result.node = task_job_result.node + task_result.server = task_job_result.server + task_result.status_code = task_job_result.status_code + task_result.status = task_job_result.status + task_result.result = task_job_result.result + if task_job_result.status == self.OK: + task_result.url_stats = url_for('log', node=task_job_result.node, opt='stats', + project=self.task.project, spider=self.task.spider, + job=task_job_result.result) + else: + task_result.url_stats = '' # 'javascript:;' + else: + task_result.execute_time = self.remove_microsecond(task_result.execute_time) + task_result.url_task_job_results = url_for('tasks', node=self.node, + task_id=self.task_id, task_result_id=task_result.id) + task_result.url_action = url_for('tasks.xhr', node=self.node, action='delete', + task_id=self.task.id, task_result_id=task_result.id) + + self.kwargs = dict( + node=self.node, + task_id=self.task_id, + task=self.task, + task_results=task_results, + url_tasks=url_for('tasks', node=self.node), + ) + + def query_task_job_results(self): + # https://docs.sqlalchemy.org/en/latest/core/sqlelement.html#sqlalchemy.sql.expression.asc + task_job_results = TaskJobResult.query.filter_by(task_result_id=self.task_result_id).order_by( + TaskJobResult.node.asc()).paginate(page=self.page, per_page=self.per_page, error_out=False) + with db.session.no_autoflush: + for index, task_job_result in enumerate(task_job_results.items, + (task_job_results.page - 1) * task_job_results.per_page + 1): + task_job_result.index = index + task_job_result.run_time = self.remove_microsecond(task_job_result.run_time) + if task_job_result.status == self.OK: + task_job_result.url_stats = url_for('log', node=task_job_result.node, opt='stats', + project=self.task.project, spider=self.task.spider, + 
job=task_job_result.result) + task_job_result.url_clusterreports = url_for('clusterreports', node=self.node, + project=self.task.project, spider=self.task.spider, + job=task_job_result.result) + else: + task_job_result.url_stats = '' # 'javascript:;' + task_job_result.url_clusterreports = '' + + self.kwargs = dict( + node=self.node, + task_id=self.task_id, + task_result_id=self.task_result_id, + task=self.task, + task_job_results=task_job_results, + url_tasks=url_for('tasks', node=self.node), + url_task_results=url_for('tasks', node=self.node, task_id=self.task_id), + ) + + +class TasksXhrView(BaseView): + + def __init__(self): + super(TasksXhrView, self).__init__() + + self.action = self.view_args['action'] # pause|resume|remove|delete|dump|fire + self.task_id = self.view_args['task_id'] # + self.task_result_id = self.view_args['task_result_id'] # + + self.task = Task.query.get(self.task_id) if self.task_id else None + self.apscheduler_job = self.scheduler.get_job(str(self.task_id)) if self.task_id else None # Return type: Job|None + self.js = dict(action=self.action, task_id=self.task_id, task_result_id=self.task_result_id, url=request.url) + + def dispatch_request(self, **kwargs): + try: + self.generate_response() + except Exception as err: + self.logger.error(traceback.format_exc()) + db.session.rollback() + self.js['status'] = 'exception' + self.js['message'] = str(err) + else: + self.js.setdefault('status', self.OK) + finally: + self.logger.debug(self.js) + return self.json_dumps(self.js, as_response=True) + + def generate_response(self): + if self.action in ['disable', 'enable']: # ENABLE|DISABLE the scheduler + self.enable_disable_scheduler() + elif self.action == 'delete': # delete a task_result|task + if self.task_result_id: + self.delete_task_result() + else: + self.delete_task() + elif self.action == 'dump': # For test only + self.dump_task_data() + elif self.action == 'fire': # update next_run_time + self.fire_task() + elif self.action == 'list': # For test only + self.list_tasks_or_results() + else: # pause|resume|remove a apscheduler_job + self.handle_apscheduler_job() + + def enable_disable_scheduler(self): + # scheduler.running: a shortcut for scheduler.state != STATE_STOPPED + # if self.scheduler.state == STATE_RUNNING: + if self.action == 'disable': + self.scheduler.pause() + else: # 'enable' + self.scheduler.resume() + handle_metadata('scheduler_state', self.scheduler.state) + self.js['tip'] = "Scheduler after '%s': %s" % (self.action, SCHEDULER_STATE_DICT[self.scheduler.state]) + + def delete_task_result(self): + task_result = TaskResult.query.get(self.task_result_id) + # In case that execute_task() has not finished + # if task_result and (task_result.pass_count or task_result.fail_count): + if task_result: + db.session.delete(task_result) + db.session.commit() + self.js['tip'] = "task_result #%s deleted. " % self.task_result_id + else: + self.js['status'] = self.ERROR + self.js['message'] = "task_result #%s not found. " % self.task_result_id + + def delete_task(self): + # Actually, the 'delete a task' button is available only when apscheduler_job is None + if request.args.get('ignore_apscheduler_job') == 'True': # For test only + self.js['tip'] = "Ignore apscheduler_job #%s. " % self.task_id + else: + if self.apscheduler_job: + self.apscheduler_job.remove() + self.js['tip'] = "apscheduler_job #%s removed. " % self.task_id + else: + self.js['tip'] = "apscheduler_job #%s not found. 
" % self.task_id + if self.task: + db.session.delete(self.task) + db.session.commit() + msg = "Task #%s deleted. " % self.task_id + apscheduler_logger.warning(msg) + self.js['tip'] += msg + else: + self.js['status'] = self.ERROR + self.js['message'] = self.js.pop('tip') + "Task #%s not found. " % self.task_id + + def fire_task(self): + if not self.apscheduler_job: + self.js['status'] = self.ERROR + self.js['message'] = "apscheduler_job #{0} not found, check if task #{0} is finished. ".format(self.task_id) + return + elif not self.apscheduler_job.next_run_time: + self.js['status'] = self.ERROR + self.js['message'] = "apscheduler_job #%s is paused, resume it first. " % self.task_id + return + self.apscheduler_job.modify(next_run_time=datetime.now()) + self.js['tip'] = "Reload this page several seconds later to check out the fire result. " + self.js['url_jump'] = url_for('tasks', node=self.node, task_id=self.task_id) + + def handle_apscheduler_job(self): + if not self.apscheduler_job: + self.js['status'] = self.ERROR + self.js['message'] = "apscheduler_job #%s not found. " % self.task_id + return + if self.action == 'pause': + self.apscheduler_job.pause() + elif self.action == 'resume': + self.apscheduler_job.resume() + else: # 'Stop' button to 'remove' + self.apscheduler_job.remove() + self.js['tip'] = u"apscheduler_job #{task_id} after '{action}': {apscheduler_job}".format( + task_id=self.task_id, action=self.action, apscheduler_job=self.scheduler.get_job(str(self.task_id))) + + def dump_task_data(self): + if not self.task: + self.js['status'] = self.ERROR + if self.apscheduler_job: # For test only + self.js['data'] = dict(apscheduler_job=self.task_id) + self.js['message'] = "apscheduler_job #%s found. " % self.task_id + else: + self.js['data'] = None + self.js['message'] = "apscheduler_job #%s not found. " % self.task_id + self.js['message'] += "Task #%s not found. " % self.task_id + return + # print(vars(self.task)) + self.js['data'] = dict((k, v) for k, v in vars(self.task).items() if not k.startswith('_')) + self.js['data']['settings_arguments'] = json.loads(self.js['data']['settings_arguments']) + self.js['data']['selected_nodes'] = json.loads(self.js['data']['selected_nodes']) + self.js['data']['create_time'] = str(self.js['data']['create_time']) + self.js['data']['update_time'] = str(self.js['data']['update_time']) + if not self.apscheduler_job: + self.js['data']['apscheduler_job'] = None + self.js['tip'] = "apscheduler_job #{id} not found. Task #{id} found. 
".format(id=self.task_id) + return + # print(self.apscheduler_job.__slots__) + # ('_scheduler', '_jobstore_alias', 'id', 'trigger', 'executor', 'func', 'func_ref', 'args', 'kwargs', + # 'name', 'misfire_grace_time', 'coalesce', 'max_instances', 'next_run_time') + self.js['data']['apscheduler_job'] = dict( + id=self.apscheduler_job.id, + name=self.apscheduler_job.name, + kwargs=self.apscheduler_job.kwargs, + misfire_grace_time=self.apscheduler_job.misfire_grace_time, + coalesce=self.apscheduler_job.coalesce, + max_instances=self.apscheduler_job.max_instances, + next_run_time=str(self.apscheduler_job.next_run_time) if self.apscheduler_job.next_run_time else None, + ) + self.js['data']['apscheduler_job']['trigger'] = dict((f.name, str(f)) + for f in self.apscheduler_job.trigger.fields) + start_date = self.apscheduler_job.trigger.start_date + self.js['data']['apscheduler_job']['trigger'].update(dict( + start_date=str(start_date) if start_date else None, + end_date=str(self.apscheduler_job.trigger.end_date) if self.apscheduler_job.trigger.end_date else None, + timezone=str(self.apscheduler_job.trigger.timezone) if self.apscheduler_job.trigger.timezone else None, + jitter=self.apscheduler_job.trigger.jitter, + )) + self.js['tip'] = "apscheduler_job #{id} found. Task #{id} found. ".format(id=self.task_id) + + def list_tasks_or_results(self): + if self.task_id and self.task_result_id: + records = TaskJobResult.query.filter_by(task_result_id=self.task_result_id).all() + elif self.task_id: + records = TaskResult.query.filter_by(task_id=self.task_id).all() + else: + records = Task.query.all() + self.js['ids'] = [i.id for i in records] diff --git a/views/system/__init__.py b/views/system/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/system/settings.py b/views/system/settings.py new file mode 100644 index 00000000..7873cf21 --- /dev/null +++ b/views/system/settings.py @@ -0,0 +1,184 @@ +# coding: utf-8 +from collections import OrderedDict, defaultdict +import re + +from flask import render_template +from logparser import SETTINGS_PY_PATH as LOGPARSER_SETTINGS_PY_PATH + +from ...common import json_dumps +from ...vars import SCHEDULER_STATE_DICT +from ..baseview import BaseView + + +class SettingsView(BaseView): + methods = ['GET'] + + def __init__(self): + super(SettingsView, self).__init__() + + self.template = 'scrapydweb/settings.html' + self.kwargs = dict(node=self.node) + + def dispatch_request(self, **kwargs): + self.update_kwargs() + return render_template(self.template, **self.kwargs) + + @staticmethod + def json_dumps(obj, sort_keys=False): + string = json_dumps(obj, sort_keys=sort_keys) + return string.replace(' true', ' True').replace(' false', ' False').replace(' null', ' None') + + @staticmethod + def protect(string): + if not isinstance(string, str): + return string + length = len(string) + if length < 4: + return '*' * length + elif length < 12: + return ''.join([string[i] if not i%2 else '*' for i in range(0, length)]) + else: + return re.sub(r'^.{4}(.*?).{4}$', r'****\1****', string) + + @staticmethod + def hide_account(string): + return re.sub(r'//.+@', '//', string) + + def update_kwargs(self): + # User settings + self.kwargs['DEFAULT_SETTINGS_PY_PATH'] = self.handle_slash(self.DEFAULT_SETTINGS_PY_PATH) + self.kwargs['SCRAPYDWEB_SETTINGS_PY_PATH'] = self.handle_slash(self.SCRAPYDWEB_SETTINGS_PY_PATH) + self.kwargs['MAIN_PID'] = self.MAIN_PID + self.kwargs['LOGPARSER_PID'] = self.LOGPARSER_PID + self.kwargs['POLL_PID'] = self.POLL_PID + + # ScrapydWeb + 
self.kwargs['scrapydweb_server'] = self.json_dumps(dict( + SCRAPYDWEB_BIND=self.SCRAPYDWEB_BIND, + SCRAPYDWEB_PORT=self.SCRAPYDWEB_PORT, + URL_SCRAPYDWEB=self.URL_SCRAPYDWEB, + ENABLE_AUTH=self.ENABLE_AUTH, + USERNAME=self.protect(self.USERNAME), + PASSWORD=self.protect(self.PASSWORD) + )) + self.kwargs['ENABLE_HTTPS'] = self.ENABLE_HTTPS + self.kwargs['enable_https_details'] = self.json_dumps(dict( + CERTIFICATE_FILEPATH=self.CERTIFICATE_FILEPATH, + PRIVATEKEY_FILEPATH=self.PRIVATEKEY_FILEPATH + )) + + # Scrapy + self.kwargs['SCRAPY_PROJECTS_DIR'] = self.handle_slash(self.SCRAPY_PROJECTS_DIR) or "''" + + # Scrapyd + servers = defaultdict(list) + for group, server, auth in zip(self.SCRAPYD_SERVERS_GROUPS, self.SCRAPYD_SERVERS, self.SCRAPYD_SERVERS_AUTHS): + _server = '%s:%s@%s' % (self.protect(auth[0]), self.protect(auth[1]), server) if auth else server + servers[group].append(_server) + + self.kwargs['servers'] = self.json_dumps(servers) + self.kwargs['LOCAL_SCRAPYD_SERVER'] = self.LOCAL_SCRAPYD_SERVER or "''" + self.kwargs['LOCAL_SCRAPYD_LOGS_DIR'] = self.handle_slash(self.LOCAL_SCRAPYD_LOGS_DIR) or "''" + self.kwargs['SCRAPYD_LOG_EXTENSIONS'] = self.SCRAPYD_LOG_EXTENSIONS + + # LogParser + self.kwargs['ENABLE_LOGPARSER'] = self.ENABLE_LOGPARSER + self.kwargs['logparser_version'] = self.LOGPARSER_VERSION + self.kwargs['logparser_settings_py_path'] = self.handle_slash(LOGPARSER_SETTINGS_PY_PATH) + self.kwargs['BACKUP_STATS_JSON_FILE'] = self.BACKUP_STATS_JSON_FILE + + # Timer Tasks + self.kwargs['scheduler_state'] = SCHEDULER_STATE_DICT[self.scheduler.state] + self.kwargs['JOBS_SNAPSHOT_INTERVAL'] = self.JOBS_SNAPSHOT_INTERVAL + + # Run Spider + self.kwargs['run_spider_details'] = self.json_dumps(dict( + SCHEDULE_EXPAND_SETTINGS_ARGUMENTS=self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS, + SCHEDULE_CUSTOM_USER_AGENT=self.SCHEDULE_CUSTOM_USER_AGENT, + SCHEDULE_USER_AGENT=self.SCHEDULE_USER_AGENT, + SCHEDULE_ROBOTSTXT_OBEY=self.SCHEDULE_ROBOTSTXT_OBEY, + SCHEDULE_COOKIES_ENABLED=self.SCHEDULE_COOKIES_ENABLED, + SCHEDULE_CONCURRENT_REQUESTS=self.SCHEDULE_CONCURRENT_REQUESTS, + SCHEDULE_DOWNLOAD_DELAY=self.SCHEDULE_DOWNLOAD_DELAY, + SCHEDULE_ADDITIONAL=self.SCHEDULE_ADDITIONAL + )) + + # Page Display + self.kwargs['page_display_details'] = self.json_dumps(dict( + SHOW_SCRAPYD_ITEMS=self.SHOW_SCRAPYD_ITEMS, + SHOW_JOBS_JOB_COLUMN=self.SHOW_JOBS_JOB_COLUMN, + JOBS_FINISHED_JOBS_LIMIT=self.JOBS_FINISHED_JOBS_LIMIT, + JOBS_RELOAD_INTERVAL=self.JOBS_RELOAD_INTERVAL, + DAEMONSTATUS_REFRESH_INTERVAL=self.DAEMONSTATUS_REFRESH_INTERVAL + )) + + # Send text + self.kwargs['slack_details'] = self.json_dumps(dict( + SLACK_TOKEN=self.protect(self.SLACK_TOKEN), + SLACK_CHANNEL=self.SLACK_CHANNEL + )) + self.kwargs['telegram_details'] = self.json_dumps(dict( + TELEGRAM_TOKEN=self.protect(self.TELEGRAM_TOKEN), + TELEGRAM_CHAT_ID=self.TELEGRAM_CHAT_ID + )) + self.kwargs['email_details'] = self.json_dumps(dict( + EMAIL_SUBJECT=self.EMAIL_SUBJECT, + )) + self.kwargs['email_sender_recipients'] = self.json_dumps(dict( + EMAIL_USERNAME=self.EMAIL_USERNAME, + EMAIL_PASSWORD=self.protect(self.EMAIL_PASSWORD), + EMAIL_SENDER=self.EMAIL_SENDER, + EMAIL_RECIPIENTS=self.EMAIL_RECIPIENTS + )) + self.kwargs['email_smtp_settings'] = self.json_dumps(dict( + SMTP_SERVER=self.SMTP_SERVER, + SMTP_PORT=self.SMTP_PORT, + SMTP_OVER_SSL=self.SMTP_OVER_SSL, + SMTP_CONNECTION_TIMEOUT=self.SMTP_CONNECTION_TIMEOUT, + )) + + + # Monitor & Alert + self.kwargs['ENABLE_MONITOR'] = self.ENABLE_MONITOR + self.kwargs['poll_interval'] = 
self.json_dumps(dict( + POLL_ROUND_INTERVAL=self.POLL_ROUND_INTERVAL, + POLL_REQUEST_INTERVAL=self.POLL_REQUEST_INTERVAL, + )) + self.kwargs['alert_switcher'] = self.json_dumps(dict( + ENABLE_SLACK_ALERT=self.ENABLE_SLACK_ALERT, + ENABLE_TELEGRAM_ALERT=self.ENABLE_TELEGRAM_ALERT, + ENABLE_EMAIL_ALERT=self.ENABLE_EMAIL_ALERT, + )) + self.kwargs['alert_working_time'] = self.json_dumps([ + dict( + ALERT_WORKING_DAYS="%s" % sorted(self.ALERT_WORKING_DAYS), # stringified so that it is displayed on a single line + remark="Monday is 1 and Sunday is 7" + ), + dict( + ALERT_WORKING_HOURS="%s" % sorted(self.ALERT_WORKING_HOURS), + remark="From 0 to 23" + ) + ]) + # alert triggers + d = OrderedDict() + d['ON_JOB_RUNNING_INTERVAL'] = self.ON_JOB_RUNNING_INTERVAL + d['ON_JOB_FINISHED'] = self.ON_JOB_FINISHED + + for key in self.ALERT_TRIGGER_KEYS: + keys = ['LOG_%s_THRESHOLD' % key, 'LOG_%s_TRIGGER_STOP' % key, 'LOG_%s_TRIGGER_FORCESTOP' % key] + d[key] = {k: getattr(self, k) for k in keys} + value = self.json_dumps(d) + value = re.sub(r'True', "True", value) + value = re.sub(r'(\s[1-9]\d*)', r"\1", value) + self.kwargs['alert_triggers'] = value + + # System + self.kwargs['DEBUG'] = self.DEBUG + self.kwargs['VERBOSE'] = self.VERBOSE + self.kwargs['DATA_PATH'] = self.DATA_PATH + self.kwargs['database_details'] = self.json_dumps(dict( + APSCHEDULER_DATABASE_URI=self.hide_account(self.APSCHEDULER_DATABASE_URI), + SQLALCHEMY_DATABASE_URI=self.hide_account(self.SQLALCHEMY_DATABASE_URI), + SQLALCHEMY_BINDS_METADATA=self.hide_account(self.SQLALCHEMY_BINDS['metadata']), + SQLALCHEMY_BINDS_JOBS=self.hide_account(self.SQLALCHEMY_BINDS['jobs']) + )) diff --git a/views/utilities/__init__.py b/views/utilities/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/views/utilities/parse.py b/views/utilities/parse.py new file mode 100644 index 00000000..57378143 --- /dev/null +++ b/views/utilities/parse.py @@ -0,0 +1,104 @@ +# coding: utf-8 +import io +import os +import re + +from flask import Blueprint, flash, redirect, render_template, request, send_from_directory, url_for +from logparser import parse +from werkzeug.utils import secure_filename + +from ...vars import PARSE_PATH +from ..baseview import BaseView + + +ALLOWED_EXTENSIONS = {'log', 'txt'} +bp = Blueprint('parse', __name__, url_prefix='/') + + +@bp.route('/parse/source/<filename>') +def source(filename): + return send_from_directory(PARSE_PATH, filename, mimetype='text/plain', cache_timeout=0) + + +class UploadLogView(BaseView): + + def __init__(self): + super(UploadLogView, self).__init__() + + self.template = 'scrapydweb/parse.html' + + def dispatch_request(self, **kwargs): + if self.POST: + file = request.files.get('file') + if not file: + flash('No file selected', self.WARN) + return redirect(request.url) + + if file.filename == '': + flash('Filename not found', self.WARN) + return redirect(request.url) + + if file.filename.rpartition('.')[-1] not in ALLOWED_EXTENSIONS: + flash('Only file types of %s are supported' % ALLOWED_EXTENSIONS, self.WARN) + return redirect(request.url) + + # Non-ASCII characters would be omitted, which may leave the filename as just 'log' or 'txt' + filename = secure_filename(file.filename) + if filename in ALLOWED_EXTENSIONS: + filename = '%s.%s' % (self.get_now_string(), filename) + file.save(os.path.join(self.PARSE_PATH, filename)) + + return redirect(url_for('.uploaded', node=self.node, filename=filename)) + else: + url_parse_demo = url_for('.uploaded', node=self.node, filename='ScrapydWeb_demo.log') + return render_template(self.template, 
node=self.node, url_parse_demo=url_parse_demo) + + +class UploadedLogView(BaseView): + + def __init__(self): + super(UploadedLogView, self).__init__() + + self.filename = self.view_args['filename'] + self.template = 'scrapydweb/stats.html' + + self.text = '' + self.project = '' + self.spider = '' + self.job = '' + + def dispatch_request(self, **kwargs): + try: + # Use io.open for compatibility with Python 2 + with io.open(os.path.join(self.PARSE_PATH, self.filename), encoding='utf-8', errors='ignore') as f: + self.text = f.read() + except Exception as err: + return render_template(self.template_fail, node=self.node, + alert="An error occurred when reading the uploaded logfile", + text='%s\n%s' % (err.__class__.__name__, err)) + + self.get_job_info() + + kwargs = dict( + project=self.project, + spider=self.spider, + job=self.job, + url_source=url_for('.source', filename=self.filename), + # url_utf8=url_utf8, # To hide url_utf8 link in page http://127.0.0.1:5000/log/uploaded/ScrapydWeb_demo.log + ) + kwargs.update(parse(self.text)) + # self.logger.debug("Parsed result: %s" % self.json_dumps(kwargs)) + return render_template(self.template, node=self.node, **kwargs) + + def get_job_info(self): + # 2018-08-21 12:21:45 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: proxy) + m_project = re.search(r'\(bot:\s(.+?)\)', self.text) + self.project = m_project.group(1) if m_project else self.NA + + # 2018-08-21 12:21:45 [test] DEBUG: from_crawler + m_spider = re.search(r'\[([^.]+?)\]\s+(?:DEBUG|INFO|WARNING|ERROR|CRITICAL)', self.text) + self.spider = m_spider.group(1) if m_spider else self.NA + + # 'LOG_FILE': 'logs\\proxy\\test\\b2095ab0a4f911e8b98614dda9e91c2f.log', + m_job = re.search(r'LOG_FILE.*?([\w-]+)\.(?:log|txt)', self.text) + self.job = m_job.group(1) if m_job else (self.filename.rpartition('.')[0] or self.filename) diff --git a/views/utilities/send_text.py b/views/utilities/send_text.py new file mode 100644 index 00000000..6a8f1120 --- /dev/null +++ b/views/utilities/send_text.py @@ -0,0 +1,138 @@ +# coding: utf-8 +import re + +from flask import render_template, request, url_for + +from ...utils.send_email import send_email +from ..baseview import BaseView + + +class SendTextView(BaseView): + + def __init__(self): + super(SendTextView, self).__init__() + + self.template = 'scrapydweb/send_text.html' + + def dispatch_request(self, **kwargs): + kwargs = dict( + node=self.node, + url_slack=url_for('sendtextapi', opt='slack', channel_chatid_subject=None, text='some-text'), + url_telegram=url_for('sendtextapi', opt='telegram', channel_chatid_subject=None, text='some-text'), + url_email=url_for('sendtextapi', opt='email', channel_chatid_subject=None, text='some-text'), + ) + return render_template(self.template, **kwargs) + + +class SendTextApiView(BaseView): + # https://api.slack.com/methods/chat.postMessage + # https://www.codementor.io/garethdwyer/building-a-telegram-bot-using-python-part-1-goi5fncay + # slack_help = 'https://api.slack.com/apps' + # telegram_help = 'https://core.telegram.org/bots#6-botfather' + + def __init__(self): + super(SendTextApiView, self).__init__() + + self.opt = self.view_args['opt'] + self.opt = 'telegram' if self.opt == 'tg' else self.opt + # https://stackoverflow.com/questions/10434599/get-the-data-received-in-a-flask-request + # request.values: combined args and form, preferring args if keys overlap + self.form = request.json or request.form + + if self.opt == 'email': + self.channel_chatid_subject = (self.view_args['channel_chatid_subject'] + or 
request.args.get('subject', None) + or self.form.get('subject', self.EMAIL_SUBJECT)) + # request.json['recipients'] could be a list type instead of a string type + recipients = re.findall(r'[^\s"\',;\[\]]+@[^\s"\',;\[\]]+', + request.args.get('recipients', '') or str(self.form.get('recipients', ''))) + self.EMAIL_KWARGS['email_recipients'] = recipients or self.EMAIL_RECIPIENTS + elif self.opt == 'slack': + self.channel_chatid_subject = (self.view_args['channel_chatid_subject'] + or request.args.get('channel', None) + or self.form.get('channel', self.SLACK_CHANNEL)) # 'general' + else: + self.channel_chatid_subject = (self.view_args['channel_chatid_subject'] + or request.args.get('chat_id', None) + or self.form.get('chat_id', self.TELEGRAM_CHAT_ID)) + self.logger.debug('channel_chatid_subject: %s', self.channel_chatid_subject) + + self.text = self.view_args['text'] or request.args.get('text', None) + if not self.text: + self.text = self.json_dumps(self.form) if self.form else 'test' + self.logger.debug('text: %s', self.text) + + self.js = {} + self.tested = False # For test only + + def dispatch_request(self, **kwargs): + if self.opt == 'email': + self.send_email() + elif self.opt == 'slack': + self.send_slack() + elif self.opt == 'telegram': + self.send_telegram() + self.js['when'] = self.get_now_string(True) + return self.json_dumps(self.js, as_response=True) + + def send_email(self): + if not self.EMAIL_PASSWORD: + self.js = dict(status=self.ERROR, result="The EMAIL_PASSWORD option is unset") + return + self.EMAIL_KWARGS['subject'] = self.channel_chatid_subject + self.EMAIL_KWARGS['content'] = self.text + result, reason = send_email(to_retry=True, **self.EMAIL_KWARGS) + if result is True: + self.logger.debug("Sent to %s via Email", self.EMAIL_KWARGS['email_recipients']) + self.js = dict(status=self.OK, + result=dict(reason=reason, sender=self.EMAIL_KWARGS['email_sender'], + recipients=self.EMAIL_KWARGS['email_recipients'], + subject=self.channel_chatid_subject, text=self.text)) + else: + self.js = dict(status=self.ERROR, result=dict(reason=reason), debug=self.EMAIL_KWARGS) + self.logger.error("Failed to send text via Email:\n%s", self.json_dumps(self.js)) + + def send_slack(self): + if not self.SLACK_TOKEN: + self.js = dict(status=self.ERROR, result="The SLACK_TOKEN option is unset") + return + url = 'https://slack.com/api/chat.postMessage' + data = dict(token=self.SLACK_TOKEN, channel=self.channel_chatid_subject, text=self.text) + status_code, js = self.make_request(url, data=data, check_status=False) + for key in ['auth', 'status', 'status_code', 'url', 'when']: + js.pop(key, None) + self.js = dict(url=url, status_code=status_code, result=js) + # {"ok":false,"error":"invalid_auth"} + # {"ok":false,"error":"channel_not_found"} + # {"ok":false,"error":"no_text"} + if js.get('ok', False): + self.logger.debug("Sent to bot %s via Slack", js.get('message', {}).get('username', '')) + self.js['status'] = self.OK + else: + self.js['status'] = self.ERROR + if self.SLACK_TOKEN: + self.js['debug'] = dict(token=self.SLACK_TOKEN, channel=self.channel_chatid_subject, + text=self.text) + self.logger.error("Failed to send text via Slack:\n%s", self.json_dumps(self.js)) + + def send_telegram(self): + if not self.TELEGRAM_TOKEN: + self.js = dict(status=self.ERROR, result="The TELEGRAM_TOKEN option is unset") + return + url = 'https://api.telegram.org/bot%s/sendMessage' % self.TELEGRAM_TOKEN + data = dict(text=self.text, chat_id=self.channel_chatid_subject) + status_code, js = self.make_request(url, 
data=data, check_status=False) + + for key in ['auth', 'status', 'status_code', 'url', 'when']: + js.pop(key, None) + self.js = dict(url=url, status_code=status_code, result=js) + if js.get('ok', False): + self.logger.debug("Sent to %s via Telegram", js.get('result', {}).get('chat', {}).get('first_name', '')) + self.js['status'] = self.OK + # {"ok":false,"error_code":400,"description":"Bad Request: chat not found"} + else: + self.js['status'] = self.ERROR + if self.TELEGRAM_TOKEN: + self.js['debug'] = dict(token=self.TELEGRAM_TOKEN, chat_id=self.channel_chatid_subject, + text=self.text) + self.logger.error("Failed to send text via Telegram:\n%s", self.json_dumps(self.js))