# crawl_directory.py
import requests
import time
# [ToDo] index: https://mathworld.wolfram.com/letters/
# [ToDo] search: https://mathworld.wolfram.com/search/?query=falling+factorial
# [Done] categories: https://mathworld.wolfram.com/topics/
# [ToDo] for final URLs: check "see also" links
#
# crawling using proxy server: https://www.scrapingbee.com/blog/python-requests-proxy/
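# A minimal sketch of routing requests through a proxy, along the lines of the
# ScrapingBee article above (the proxy address and credentials are placeholders,
# not part of this crawler; uncomment and adapt only if a proxy is actually needed):
#
#   proxies = {
#       "http":  "http://user:password@proxy.example.com:8080",
#       "https": "http://user:password@proxy.example.com:8080",
#   }
#   resp = requests.get("https://mathworld.wolfram.com/topics/",
#                       proxies=proxies, timeout=5)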
URL_list = []
URL_parent_Category = {}
categoryLevel = {}
history = {}
final_URLs = {}
URL_base1 = "https://mathworld.wolfram.com/topics/" # for directory pages (root)
URL_base2 = "https://mathworld.wolfram.com/" # for final pages
seed_URL = "https://mathworld.wolfram.com/topics/ProbabilityandStatistics.html"
seed_category = "Probability and Statistics" # "Root" if starting at URL_base1
categoryLevel[seed_category] = 1 # set to 0 if starting at URL_base1
# seed_URL = "https://mathworld.wolfram.com/topics/"
# seed_category = "Root" # "Root" if starting at URL_base1
# categoryLevel[seed_category] = 0 # set to 0 if starting at URL_base1
URL_list.append(seed_URL)  # URL queue (crawled in FIFO order)
URL_parent_Category[seed_URL] = seed_category
parsed = 0 # number of URLs already parsed
n_URLs = 1 # total number of URLs in the queue
max_URLs = 5000 # do not crawl more than max_URLs directory pages
def validate(string):
    # reject navigation links (About, Classroom, etc.), overly long paths,
    # and intermediate /topics/ pages: keep only final content pages
    Ignore = ['about/', 'classroom/', 'contact/', 'whatsnew/', 'letters/']
    validated = True
    if len(string) > 60 or string in Ignore or string.count('topics') > 0:
        validated = False
    return validated
def update_lists(new_URL, new_category, parent_category, file):
    URL_parent_Category[new_URL] = new_category
    # if new_category was encountered before, special processing would be required
    # --> in that case the category-to-parent mapping is not one-to-one (ignored here)
    categoryLevel[new_category] = 1 + categoryLevel[parent_category]
    level = str(categoryLevel[new_category])
    file.write(level + "\t" + new_category + "\t" + parent_category + "\n")
    file.flush()
    return
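# Note: update_lists() overwrites categoryLevel[new_category] if the same
# category shows up under several parents. A possible refinement (a sketch,
# not used by this script) would keep the smallest depth seen so far:
#
#   if new_category in categoryLevel:
#       categoryLevel[new_category] = min(categoryLevel[new_category],
#                                         1 + categoryLevel[parent_category])
#   else:
#       categoryLevel[new_category] = 1 + categoryLevel[parent_category]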
#---[1] Creating category structure and list of webpages
# file1 lets you resume from where the crawl stopped in case of a crash
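# A possible way to resume from crawl_log.txt after a crash (a sketch, assuming a
# previous log exists; this script as written restarts from scratch, and file1
# would need mode "a" instead of "w" to preserve the old log):
#
#   import os
#   if os.path.exists("crawl_log.txt"):
#       with open("crawl_log.txt", "r", encoding="utf-8") as old_log:
#           for row in old_log:
#               fields = row.rstrip("\n").split("\t")
#               if len(fields) >= 2 and fields[1] != "Queued":
#                   history[fields[0]] = True   # mark as already visited
#   # a full resume would also rebuild URL_list and categoryLevel from the log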
file1 = open("crawl_log.txt","w",encoding="utf-8")
file2 = open("crawl_categories.txt","w",encoding="utf-8")
while parsed < min(max_URLs, n_URLs):
    URL = URL_list[parsed]  # crawl the first non-visited URL in the queue
    parent_category = URL_parent_Category[URL]
    level = categoryLevel[parent_category]
    time.sleep(2.5)  # slow down crawling to avoid being blocked
    parsed += 1
    if URL in history:
        # do not crawl the same URL twice
        print("Duplicate: %s" % (URL))
        file1.write(URL+"\tDuplicate\t"+parent_category+"\t"+str(level)+"\n")
    else:
        print("Parsing: %5d out of %5d: %s" % (parsed, n_URLs, URL))
        # req = requests.get(server, auth=('user', "pass"))
        resp = requests.get(URL, timeout=5)
        history[URL] = resp.status_code
        if resp.status_code != 200:
            print("Failed: %s" % (URL))
            file1.write(URL+"\tError:"+str(resp.status_code)+"\t"+parent_category+"\t"+str(level)+"\n")
            file1.flush()
        else:  # URL successfully crawled
            file1.write(URL+"\tParsed\t"+parent_category+"\t"+str(level)+"\n")
            page = resp.text
            page = page.replace('\n', ' ')
            page1 = page.split("<a href=\"/topics/")
            page2 = page.split("<a href=\"/")
            n_URLs_old = n_URLs
            # scraping Type-1 page (intermediate directory node)
            for line in page1:
                line = line.split("<span>")
                line = line[0]
                if line.count(">") == 1:
                    line = line.split("\">")
                    if len(line) > 1:
                        new_URL = URL_base1 + line[0]
                        new_category = line[1]
                        URL_list.append(new_URL)
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(new_URL+"\tQueued\t"+new_category+"\t"+str(level+1)+"\n")
                        file1.flush()
                        n_URLs += 1
            # scraping Type-2 page (final directory node): no /topics/ links found
            if n_URLs == n_URLs_old:
                for line in page2:
                    line = line.split("</a>")
                    line = line[0].split("\">")
                    if validate(line[0]) and len(line) > 1:
                        new_URL = URL_base2 + line[0]
                        new_category = line[1]
                        update_lists(new_URL, new_category, parent_category, file2)
                        file1.write(new_URL+"\tEndNode\t"+new_category+"\t"+str(level+1)+"\n")
                        file1.flush()
                        final_URLs[new_URL] = (new_category, parent_category, level+1)
file1.close()
file2.close()
# save list of final URLs to use in step [2]
count = 0
file = open("list_final_URLs.txt","w",encoding="utf-8")
for URL in final_URLs:
    count += 1
    file.write(str(count)+"\t"+URL+"\t"+str(final_URLs[URL])+"\t\n")
file.close()
print()
#---[2] Extracting content from final URLs
# file_log + file_input let you resume from where the crawl stopped (in case of a crash)
file_input = open("list_final_URLs.txt","r",encoding="utf-8")
file_log = open("crawl_content_log.txt","w",encoding="utf-8")
begin = 1   # first entry of list_final_URLs.txt to fetch in this run
end = 500   # last entry to fetch (process the list in chunks of pages)
file_output = open("crawl_final_"+str(begin)+"_"+str(end)+".txt","w",encoding="utf-8")
separator = "\t~"
Lines = file_input.readlines()
file_input.close()
n = len(Lines) # number of final URLs (final pages)
for line in Lines:
    line = line.split("\t")
    count = int(line[0])
    URL = line[1]
    category = line[2]
    category = category.replace('\n', '')  # str.replace returns a new string; reassign it
    if count >= begin and count <= end:
        print("Page count: %d/%d %s" % (count, n, URL))
        resp = requests.get(URL, timeout=5)
        if resp.status_code == 200:
            # add page content: one line per page in the output file
            page = resp.text
            page = page.replace('\n', ' ')
            file_output.write(URL+"\t"+category+separator+page+"\t\n")
            file_output.flush()
        else:
            print("Error:", resp.status_code)
            file_log.write(str(count)+"\t"+URL+"\t\n")
            file_log.flush()
        time.sleep(2.5)  # slow down crawling to avoid being blocked
file_log.close()
file_output.close()
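# A minimal sketch (not part of the crawl) of how the output written to
# file_output could be read back, given the one-line-per-page format and the
# "\t~" separator defined above; the file name assumes begin=1 and end=500:
#
#   with open("crawl_final_1_500.txt", "r", encoding="utf-8") as f:
#       for row in f:
#           header, html = row.split("\t~", 1)
#           url, category = header.split("\t", 1)
#           # url, category, and the raw html of the page are now available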