2 changes: 1 addition & 1 deletion .python-version
@@ -1 +1 @@
-3.6.8
+3.11
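`.python-version` is the interpreter pin read by pyenv. Moving from the exact `3.6.8` to the `3.11` prefix lets recent pyenv releases resolve to the newest installed 3.11.x instead of one frozen patch release. Assuming pyenv is what consumes this file here, it is normally rewritten with:

```sh
pyenv install 3.11   # recent pyenv resolves the prefix to the latest 3.11.x
pyenv local 3.11     # rewrites .python-version with "3.11"
```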
4 changes: 3 additions & 1 deletion .travis.yml
@@ -1,9 +1,11 @@
 language: python
+dist: jammy
 python:
-  - "3.6"
+  - "3.11"
 # command to install dependencies
 install:
   - make dev
 # command to run tests
 script:
   - make tests
+  - make coverage
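For reference, the Travis config after this hunk (list indentation per the usual Travis style):

```yaml
language: python
dist: jammy
python:
  - "3.11"
# command to install dependencies
install:
  - make dev
# command to run tests
script:
  - make tests
  - make coverage
```

Note that `script` calls `make tests` while the Makefile hunks below only show a `test` target; presumably a `tests` alias exists outside the visible hunks.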
14 changes: 9 additions & 5 deletions Makefile
@@ -2,7 +2,9 @@
 
 PYTHON=venv/bin/python3
 PIP=venv/bin/pip
-NOSE=venv/bin/nosetests
+COVERAGE=venv/bin/coverage
+TEST_RUNNER=venv/bin/pytest
+TEST_RUNNER_FLAGS=-s --durations=3 --durations-min=0.005
 FLAKE=venv/bin/flake8
 PYPICLOUD_HOST=pypicloud.getkeepsafe.local
 PIP_ARGS=--extra-index=http://$(PYPICLOUD_HOST)/simple/ --trusted-host $(PYPICLOUD_HOST)
@@ -30,14 +32,16 @@ flake:
 	$(FLAKE) validator tests
 
 test: flake
-	$(NOSE) -s $(FLAGS)
+	$(COVERAGE) run -m pytest $(TEST_RUNNER_FLAGS)
 
 vtest:
-	$(NOSE) -s -v $(FLAGS)
+	$(COVERAGE) run -m pytest -v $(TEST_RUNNER_FLAGS)
 
+testloop:
+	while sleep 1; do $(TEST_RUNNER) -s --lf $(TEST_RUNNER_FLAGS); done
+
 cov cover coverage:
-	$(NOSE) -s --with-cover --cover-html --cover-html-dir ./coverage $(FLAGS)
-	echo "open file://`pwd`/coverage/index.html"
+	$(COVERAGE) report -m
 
 clean:
 	rm -rf `find . -name __pycache__`
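With nose gone, measurement and reporting are split across targets: `make test` runs pytest under coverage (after flake8), and `make coverage` prints the report from the `.coverage` data file the run leaves behind. Usage sketch, assuming `make dev` has already created `venv/`:

```sh
make test      # flake8, then: coverage run -m pytest -s --durations=3 --durations-min=0.005
make coverage  # coverage report -m, reads the .coverage file written by `make test`
make testloop  # loops pytest --lf (only the tests that failed last time), once per second
```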
6 changes: 6 additions & 0 deletions setup.cfg
@@ -7,3 +7,9 @@ ignore = F403
 
 [pep8]
 max-line-length = 120
+
+[coverage:run]
+branch = True
+
+[coverage:report]
+fail_under = 96
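These sections wire the new Makefile targets to policy: `branch = True` makes `coverage run` track branch (not just line) coverage, and `fail_under = 96` turns `coverage report` into a gate, which is what fails the Travis build. A local check might look like this (a sketch; coverage.py exits with status 2 when the total falls under the threshold):

```sh
venv/bin/coverage run -m pytest
venv/bin/coverage report -m   # fails (exit 2) if total coverage < 96%
```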
16 changes: 8 additions & 8 deletions setup.py
@@ -1,27 +1,26 @@
 import os
 from setuptools import setup, find_packages
 
 
-version = '0.7.1'
+version = '1.0.0'
 
 
 def read(f):
     return open(os.path.join(os.path.dirname(__file__), f)).read().strip()
 
 
 install_requires = [
-    'sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@0.4.1#egg=sdiff',
-    'aiohttp >=3, <3.4',
+    'sdiff @ git+https://github.com/KeepSafe/html-structure-diff.git@1.0.0#egg=sdiff',
+    'aiohttp==3.8.5',
     'Markdown',
     'parse >=1, <2',
     'beautifulsoup4 >=4, <5',
-    'lxml >=3',
+    'lxml<5',
 ]
 
 tests_require = [
-    'nose',
-    'flake8==3.6.0',
-    'coverage',
+    'pytest >= 8',
+    'coverage==7.6.1',
+    'flake8==7.1.1',
 ]
 
 devtools_require = [
@@ -32,6 +31,7 @@ def read(f):
 setup(
     name='content-validator',
     version=version,
+    python_requires='>=3.11',
     description=('Content validator looks at text content and preforms different validation tasks'),
     classifiers=[
         'License :: OSI Approved :: BSD License', 'Intended Audience :: Developers', 'Programming Language :: Python'
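`python_requires='>=3.11'` makes pip refuse the install on older interpreters up front, rather than failing later on syntax that needs a newer Python (e.g. the `list[str] | None` annotations further down, which need 3.10+). Expected behaviour, sketched:

```sh
python3.8 -m pip install .   # ERROR: Package 'content-validator' requires a different Python: ... not in '>=3.11'
python3.11 -m pip install .  # proceeds, pulling sdiff from the pinned 1.0.0 tag
```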
1 change: 0 additions & 1 deletion tests/utils.py
@@ -1,4 +1,3 @@
-
 def read(path):
     with open(path) as fp:
         return fp.read()
10 changes: 5 additions & 5 deletions validator/__init__.py
@@ -3,7 +3,7 @@
 from . import parsers, checks, reports, fs
 
 
-class Validator(object):
+class Validator:
     def __init__(self, contents, parser, reader, check, reporter=None):
         self.contents = contents
         self.parser = parser
@@ -24,7 +24,7 @@ async def async_validate(self):
         return errors
 
 
-class ReportBuilder(object):
+class ReportBuilder:
     def __init__(self, contents, parser, reader, check):
         self.contents = contents
         self.parser = parser
@@ -49,7 +49,7 @@ def validate(self):
         return Validator(self.contents, self.parser, self.reader, self.check, reporter).validate()
 
 
-class CheckBuilder(object):
+class CheckBuilder:
     def __init__(self, contents, content_type, parser, reader):
         self.contents = contents
         self.content_type = content_type
@@ -89,7 +89,7 @@ async def async_validate(self):
         return res
 
 
-class ParserBuilder(object):
+class ParserBuilder:
     def __init__(self, contents, reader=None):
         self.contents = contents
         self.content_type = 'txt'
@@ -120,7 +120,7 @@ def check(self):
         return CheckBuilder(self.contents, self.content_type, parser, self.reader)
 
 
-class ContentBuilder(object):
+class ContentBuilder:
     def files(self, pattern, **kwargs):
         contents = fs.files(pattern, **kwargs)
         return ParserBuilder(contents, parsers.FileReader())
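All five class changes here (and the matching ones in the files below) drop the explicit `object` base, which is redundant on Python 3: every class is new-style, so behaviour and MRO are identical. A minimal check:

```python
class A(object):  # old spelling
    pass

class B:          # new spelling; object is implicit
    pass

assert A.__mro__[1:] == B.__mro__[1:] == (object,)
```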
6 changes: 2 additions & 4 deletions validator/checks/__init__.py
@@ -1,5 +1,3 @@
-from typing import Type
-
 from sdiff import MdParser
 
 from .md import MarkdownComparator
@@ -21,7 +19,7 @@ def url_occurences(filetype):
     return UrlOccurenciesValidator()
 
 
-def markdown(filetype, md_parser_cls: Type[MdParser] = MdParser):
+def markdown(filetype, md_parser_cls: type[MdParser] = MdParser):
     if filetype not in ['txt', 'html']:
         raise UndefinedCheckTypeError('got filetype %s' % filetype)
     return MarkdownComparator(md_parser_cls)
@@ -33,7 +31,7 @@ def java_args(filetype):
     return JavaComparator()
 
 
-class ChainCheck(object):
+class ChainCheck:
     def __init__(self, checks):
         self.checks = checks
 
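`type[MdParser]` is the PEP 585 spelling of `typing.Type[MdParser]`: since Python 3.9 the builtin `type` can be parametrized directly, so the `typing` import disappears. Callers are unaffected; a sketch of the factory (with a hypothetical `MdParser` subclass for the override case):

```python
from sdiff import MdParser
from validator import checks

class StrictParser(MdParser):  # hypothetical subclass, just to show the override
    pass

checks.markdown('html')                             # MarkdownComparator with the default MdParser
checks.markdown('txt', md_parser_cls=StrictParser)  # same, with the custom parser class
checks.markdown('csv')                              # raises UndefinedCheckTypeError('got filetype csv')
```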
2 changes: 1 addition & 1 deletion validator/checks/java.py
@@ -6,7 +6,7 @@
 REF_PATTERN = r'@string/\w+'
 
 
-class JavaComparator(object):
+class JavaComparator:
     def _get_args(self, content):
         return re.findall(ARG_PATTERN, content)
 
5 changes: 2 additions & 3 deletions validator/checks/md.py
@@ -1,5 +1,4 @@
 import re
-from typing import Type
 
 from sdiff import diff, renderer, MdParser
 from markdown import markdown
@@ -14,8 +13,8 @@ def save_file(content, filename):
         fp.write(content)
 
 
-class MarkdownComparator(object):
-    def __init__(self, md_parser_cls: Type[MdParser] = MdParser):
+class MarkdownComparator:
+    def __init__(self, md_parser_cls: type[MdParser] = MdParser):
         self._md_parser_cls = md_parser_cls
 
     def check(self, data, parser, reader):
25 changes: 12 additions & 13 deletions validator/checks/url.py
@@ -5,7 +5,6 @@
 import string
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
-from typing import List, Optional
 
 from ..errors import UrlDiff, UrlOccurencyDiff
 
@@ -23,7 +22,7 @@ class MissingUrlExtractorError(Exception):
 # the job of extractors is to find all non-parametrized urls in the given text for later checks via UrlValidator
 # which examines is particular url leads to working webpage (200 status)
 # since we are interested in all urls (including parametrized) we need to sligthly change their API and behaviour
-class TextUrlExtractor(object):
+class TextUrlExtractor:
     def __init__(self, **kwargs):
         pass
 
@@ -60,12 +59,12 @@ def _validate_email(self, email):
         return False
 
     def _extract_from_anchors(self, soup):
-        return set([a.get('href') or a.text for a in soup.find_all('a')])
+        return {a.get('href') or a.text for a in soup.find_all('a')}
 
     def _extract_from_img(self, soup):
         if self.skip_images:
             return set()
-        return set([img.get('src') for img in soup.find_all('img')])
+        return {img.get('src') for img in soup.find_all('img')}
 
     def _fix_url(self, url):
         result = ''
@@ -82,7 +81,7 @@ def _fix_url(self, url):
         if re.match(self.url_pattern, full_url):
             result = full_url
         else:
-            logging.error('{} not tested'.format(url_parsed.geturl()))
+            logging.error(f'{url_parsed.geturl()} not tested')
         return result
 
     def extract_urls(self, content, keep_placeholders=False):
@@ -96,20 +95,20 @@
         return result
 
 
-class UrlStatusChecker(object):
+class UrlStatusChecker:
     retry_max_count = 3
 
-    def __init__(self, headers=None, exclude_urls_regexs: Optional[List[str]] = None):
+    def __init__(self, headers=None, exclude_urls_regexs: list[str] | None = None):
         self._exclude_urls_regex = exclude_urls_regexs or []
         if self._exclude_urls_regex:
-            logging.warning('Excluded urls regexps: {}'.format(self._exclude_urls_regex))
+            logging.warning(f'Excluded urls regexps: {self._exclude_urls_regex}')
         self._headers = headers or {}
         if 'User-Agent' not in self._headers:
             self._headers['User-Agent'] = DEFAULT_USER_AGENT
 
     async def _make_request(self, url):
         try:
-            logging.info('checking {}'.format(url))
+            logging.info(f'checking {url}')
             async with aiohttp.request('get', url, headers=self._headers, allow_redirects=True) as res:
                 return res.status
         except Exception:
@@ -143,7 +142,7 @@ async def _check_urls_coro(self, urls, future):
             if not is_exluded:
                 urls_without_excluded.append(url)
             else:
-                logging.warning('url {} excluded from status check'.format(url.url))
+                logging.warning(f'url {url.url} excluded from status check')
         tasks = [self._request_status_code(url.url) for url in urls_without_excluded]
         results = await asyncio.gather(*tasks)
         for index, url in enumerate(urls_without_excluded):
@@ -167,10 +166,10 @@ async def async_check(self, urls):
         return future.result()
 
 
-class UrlValidator(object):
+class UrlValidator:
     _extractors = {'txt': TextUrlExtractor, 'html': HtmlUrlExtractor}
 
-    def __init__(self, filetype, headers=None, exclude_status_check_regexs: Optional[List[str]] = None, **kwargs):
+    def __init__(self, filetype, headers=None, exclude_status_check_regexs: list[str] | None = None, **kwargs):
         self.client_headers = headers or {}
         self._excluded_status_check_regexs = exclude_status_check_regexs or []
         extractor_class = self._extractors.get(filetype)
@@ -179,7 +178,7 @@ def __init__(self, filetype, headers=None, exclude_status_check_regexs: Optional
         self.extractor = extractor_class(**kwargs)
 
     def _get_urls(self, data, parser, reader):
-        flat_data = set(p for sublist in data for p in sublist)
+        flat_data = {p for sublist in data for p in sublist}
         # TODO yield instead
         urls = {}
         for element in flat_data:
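The same modernization pattern runs through this file: set comprehensions instead of `set([...])`, f-strings instead of `.format()`, and PEP 604 `list[str] | None` instead of `Optional[List[str]]`. Putting the checker together, roughly (a sketch; per the hunks above, URLs matching an exclusion regex are only logged, and status codes are written back onto the `UrlDiff` objects):

```python
import asyncio

from validator.checks.url import UrlStatusChecker
from validator.errors import UrlDiff

async def main():
    checker = UrlStatusChecker(exclude_urls_regexs=[r'^https?://localhost'])
    urls = [UrlDiff('https://example.com'), UrlDiff('http://localhost:8000/dev')]
    await checker.async_check(urls)  # localhost URL is skipped, the other is requested
    return urls

asyncio.run(main())
```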
6 changes: 3 additions & 3 deletions validator/errors.py
@@ -1,7 +1,7 @@
 from collections import namedtuple
 
 
-class UrlDiff(object):
+class UrlDiff:
 
     def __init__(self, url, files=None, status_code=200, has_disallowed_chars=False):
         self.url = url
@@ -10,7 +10,7 @@ def __init__(self, url, files=None, status_code=200, has_disallowed_chars=False)
         self.has_disallowed_chars = has_disallowed_chars
 
     def __str__(self):
-        return 'Url(%s, %s, %s, %s)' % (self.url, self.files, self.status_code, self.has_disallowed_chars)
+        return 'Url({}, {}, {}, {})'.format(self.url, self.files, self.status_code, self.has_disallowed_chars)
 
     def __repr__(self):
         return 'Url: %s' % self.url
@@ -37,7 +37,7 @@ def is_valid(self):
 ContentData.__new__.__defaults__ = ('', ) * 2
 
 
-class MdDiff(object):
+class MdDiff:
 
     def __init__(self, base, other, error_msgs):
         self.base = base
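Unlike the f-string conversions elsewhere in the PR, `__str__` moves to `str.format` here (and `__repr__` even keeps its %-formatting). The output is identical either way:

```python
url, files, status, bad = 'http://x', ['a.md'], 404, False
assert 'Url(%s, %s, %s, %s)' % (url, files, status, bad) == \
       'Url({}, {}, {}, {})'.format(url, files, status, bad)
# both render: Url(http://x, ['a.md'], 404, False)
```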
4 changes: 2 additions & 2 deletions validator/fs.py
@@ -91,10 +91,10 @@ def files(pattern, **kwargs):
     [[Path(path/to1/file1.txt), Path(path/to1/file2.txt)], [Path(path/to2/file1.txt), Path(path/to2/file2.txt)]]
     """
     # extract named parameters from the pattern
-    params = set([p for p in map(lambda e: e[1], Formatter().parse(pattern)) if p])
+    params = {p for p in map(lambda e: e[1], Formatter().parse(pattern)) if p}
     if params:
         if len(params - kwargs.keys()) > 0:
-            raise ValueError('missing parameters {} for pattern {}'.format(params - kwargs.keys(), pattern))
+            raise ValueError(f'missing parameters {params - kwargs.keys()} for pattern {pattern}')
         return _params_pattern(pattern, params, **kwargs)
     else:
         return _no_params_pattern(pattern)
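The f-string keeps the contract of `files()` intact: every `{name}` field in the pattern must arrive as a keyword argument, or the call dies early with the `ValueError` above. Sketched (the kwarg value here is an assumption; only the error path is verbatim):

```python
from validator import fs

fs.files('content/{lang}/index.md', lang='en')  # pattern parameter supplied
fs.files('content/{lang}/index.md')             # ValueError: missing parameters {'lang'} for pattern content/{lang}/index.md
```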
12 changes: 6 additions & 6 deletions validator/parsers.py
@@ -9,22 +9,22 @@ def __init__(self, msg):
         super().__init__(msg)
 
 
-class FileReader(object):
+class FileReader:
     def read(self, path):
         return read_content(path)
 
 
-class TxtReader(object):
+class TxtReader:
     def read(self, content):
         return content
 
 
-class MarkdownParser(object):
+class MarkdownParser:
     def parse(self, content):
         return markdown.markdown(content)
 
 
-class XmlParser(object):
+class XmlParser:
     def __init__(self, query='*'):
         self.query = query
 
@@ -38,12 +38,12 @@ def parse(self, content):
         return '\n\n'.join(texts)
 
 
-class CsvParser(object):
+class CsvParser:
     def parse(self, content):
         return '\n'.join(content.split(','))
 
 
-class ChainParser(object):
+class ChainParser:
     def __init__(self, parsers):
         self.parsers = parsers
 
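The parsers behave exactly as before the class-statement cleanup; from the bodies above:

```python
from validator.parsers import CsvParser, MarkdownParser

CsvParser().parse('a,b,c')         # -> 'a\nb\nc'
MarkdownParser().parse('# Title')  # -> '<h1>Title</h1>'
```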