-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrss.py
More file actions
95 lines (74 loc) · 2.53 KB
/
rss.py
File metadata and controls
95 lines (74 loc) · 2.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import feedparser
import requests
import db
import telegram
def get_feeds_url():
    """Read feed URLs from feeds.txt, one per line.

    Returns:
        list[str]: every line of the file with its trailing newline removed
        (comment lines starting with '#' are included; the caller filters them).
    """
    # 'with' guarantees the file handle is closed; the original open() leaked it.
    with open('feeds.txt') as feeds_file:
        return [line.rstrip('\n') for line in feeds_file]
def is_url_ok(url, report=False):
    """Probe a URL with an HTTP HEAD request and report problems.

    Args:
        url: the URL to check.
        report: when True, also notify the Telegram admin about failures.

    Returns:
        True when the server answered (any status code — a non-200 is only
        reported, the feed is still processed); False when the request failed.
    """
    try:
        # timeout prevents the crawler from hanging forever on a dead host;
        # the original call had no timeout at all.
        r = requests.head(url, timeout=10)
        if r.status_code == 200:
            print('Url is OK: ' + url)
            return True
        else:
            print('StatusCode is {}: {} '.format(r.status_code, url))
            if report:
                telegram.msg_to_admin('⚠️ StatusCode is ' + str(r.status_code) + ':\n' + url)
            # Deliberate: a non-200 HEAD response still counts as reachable.
            return True
    except requests.RequestException:
        # RequestException covers ConnectionError, Timeout, invalid URLs, etc.
        # The original caught only ConnectionError, so timeouts crashed the run.
        print('Failed to connect: ' + url)
        if report:
            telegram.msg_to_admin('🚫 Failed to connect:\n' + url)
        return False
def read_article_feed(feed_url):
    """Fetch one RSS/Atom feed and store its unseen entries in the database.

    Skips the feed entirely when the URL is unreachable. New articles from a
    feed seen for the first time are flagged via should_published() so the
    initial crawl is not mass-published.
    """
    if not is_url_ok(feed_url):
        return
    try:
        feed = feedparser.parse(feed_url)
        print('Count is ' + str(len(feed['entries'])))
        first_crawl = should_published(feed_url)
        for article in feed['entries']:
            title = article['title']
            link = article['link']
            # Atom feeds may carry 'updated' instead of 'published'.
            if 'published' in article:
                date = article['published']
            else:
                date = article['updated']
            # Resolve feedproxy indirection so dedup keys on the real URL.
            if 'feedproxy.google.com' in link:
                link = get_redirect_url(link)
            if not is_article_in_db(link):
                add_article_to_db(title, link, date, first_crawl)
    except Exception as exc:
        # BUG FIX: the original 'except():' caught the empty tuple — i.e.
        # nothing — so parse failures crashed instead of alerting the admin.
        print('Failed to parse {}: {}'.format(feed_url, exc))
        telegram.msg_to_admin('⛔️ Failed to parse:\n' + feed_url)
def get_redirect_url(url):
    """Follow all HTTP redirects for *url* and return the final resolved URL."""
    response = requests.get(url)
    return response.url
def is_article_in_db(url):
    """Return True when an article with this link is already stored in db.feeds."""
    return db.feeds.count_documents({'link': url}) > 0
def should_published(url):
    """Register a feed URL on first sight.

    Returns:
        True when this is the first time the feed URL is seen (it is inserted
        into db.urls as a side effect); False when it was already known.
        Callers use the True case to mark the initial crawl's articles.
    """
    query = {'link': url}
    # Guard clause: a known URL needs no insert.
    if db.urls.count_documents(query) > 0:
        return False
    # Removed the unused 'x =' binding the original kept on insert_one().
    db.urls.insert_one({'link': url})
    return True
def add_article_to_db(title, link, date, is_pub):
    """Insert one article document into db.feeds and print its new ObjectId."""
    document = {
        'title': title,
        'link': link,
        'date': date,
        'is_pub': is_pub,
    }
    result = db.feeds.insert_one(document)
    print(result.inserted_id)
if __name__ == '__main__':
    # Crawl every feed listed in feeds.txt, skipping comments and blanks.
    lines = get_feeds_url()
    print('Feeds count: ' + str(len(lines)))
    index = 0
    for line in lines:
        # Strip stray whitespace so ' http://x ' and trailing spaces work;
        # skip blank lines — the original passed '' straight to requests,
        # which raises MissingSchema and aborted the whole run.
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        index += 1
        print('Processing feed #' + str(index) + ' : ' + line)
        read_article_feed(line)
    # db.mark_as_read_all()