1
+ import os
2
+ import argparse
3
+ import tarfile
4
+ import json
5
+ import gzip
6
+
7
+ from glob import glob
8
+ from tqdm import tqdm
9
+
10
+ import multiprocessing as mp
11
+
12
+ import code_diff as cd
13
+ from code_diff .diff_utils import parse_hunks , clean_hunk
14
+
15
def load_slcs_from_tar(tar_files):
    """Stream JSON records out of gzip-compressed tar archives.

    Every regular file inside each archive is read line by line; each
    non-blank line is decoded as UTF-8 and parsed as one JSON document.

    Args:
        tar_files: Iterable of paths to ``.tar.gz`` archives.

    Yields:
        The parsed JSON value of each line, in the order the members and
        lines appear in the archives.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for tar_file in tar_files:
        # The context manager guarantees the archive is closed once it has
        # been consumed, when parsing fails, or when the generator is closed.
        with tarfile.open(tar_file, "r:gz") as tar:
            for tarinfo in tar:
                # Directories, links, etc. carry no line data.
                if not tarinfo.isfile():
                    continue
                with tar.extractfile(tarinfo) as lines:
                    for line in lines:
                        # Tolerate blank lines (e.g. a trailing newline)
                        # instead of failing inside json.loads.
                        if not line.strip():
                            continue
                        yield json.loads(line.decode("utf-8"))
25
+
26
+
27
def create_output(slc, **kwargs):
    """Assemble the output record for a single-line change.

    A fixed set of fields is copied from ``slc``; any keyword arguments are
    then merged on top and therefore override copied fields of the same name.

    Args:
        slc: Mapping that must contain every copied field listed below.
        **kwargs: Extra fields to add to (or override in) the record.

    Returns:
        A new dict holding the copied fields followed by ``kwargs``.

    Raises:
        KeyError: If ``slc`` lacks one of the copied fields.
    """
    copied_fields = (
        # Provenance of the change.
        "project",
        "commit_sha",
        "parent_sha",
        "file_path",
        "project_url",
        # Flags carried over from the input record.
        "likely_bug",
        "comodified",
        "in_function",
        # The raw diff text.
        "diff",
    )

    record = {field: slc[field] for field in copied_fields}
    for name, value in kwargs.items():
        record[name] = value
    return record
45
+
46
+
47
def process_slc(slc):
    """Turn one single-line-change record into an output record.

    The record's ``"diff"`` text is split into hunks with ``parse_hunks``;
    each hunk is cleaned with ``clean_hunk`` and diffed with
    ``cd.difference`` (language fixed to ``"python"``). Hunks for which
    ``cd.difference`` raises ``ValueError`` are skipped.

    Args:
        slc: Mapping with a ``"diff"`` entry plus the fields required by
            ``create_output``.

    Returns:
        ``None`` unless exactly one hunk produced a diff. Otherwise the
        result of ``create_output`` extended with ``before``, ``after``,
        ``sstub_pattern`` and ``edit_script``.
    """
    candidates = []
    for raw_hunk in parse_hunks(slc["diff"]):
        cleaned = clean_hunk(raw_hunk)
        try:
            candidate = cd.difference(cleaned.before, cleaned.after, lang="python")
        except ValueError:
            # This hunk could not be diffed; move on to the next one.
            continue
        candidates.append(candidate)

    # Only records with a single usable hunk are kept.
    if len(candidates) != 1:
        return None

    (source_diff,) = candidates

    pattern_name = source_diff.sstub_pattern().name
    script_text = str(source_diff.edit_script())

    try:
        # Prefer the text of the statement-level diff when it is available.
        narrowed = source_diff.statement_diff()
        before, after = narrowed.source_text, narrowed.target_text
    except ValueError:
        # Fall back to the text of the full diff.
        before, after = source_diff.source_text, source_diff.target_text

    return create_output(
        slc,
        before=before,
        after=after,
        sstub_pattern=pattern_name,
        edit_script=script_text,
    )
82
+
83
+
84
# Save to sharded jsonl files (NOTE: currently written uncompressed, not .gz)
85
+
86
class JsonlGzSaver:
    """Write JSON-serialisable objects to numbered JSON-lines shard files.

    Objects are appended one per line to ``file-<k>.jsonl`` inside
    ``save_dir``. Once a shard holds ``num_objects`` lines, the next saved
    object goes to a fresh shard.

    NOTE: despite the class name, shards are opened with plain ``open`` and
    are therefore uncompressed UTF-8 text.

    The saver can be used as a context manager; leaving the ``with`` block
    closes the current shard.

    Args:
        save_dir: Directory that receives the shard files. It is created if
            it does not exist yet.
        num_objects: Maximum number of objects per shard.
    """

    def __init__(self, save_dir, num_objects=1e5):
        self.save_dir = save_dir
        self.num_objects = num_objects

        self.object_count = 0  # objects written to the current shard
        self.file_count = 0    # index of the next shard to open

        # Create the target directory up front so that opening the first
        # shard does not fail on a fresh output path.
        os.makedirs(self.save_dir, exist_ok=True)

        self.file_handler = None
        self._update_handler()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _update_handler(self):
        """Open the next shard if none is open or the current one is full."""
        need_update = (
            self.file_handler is None
            or self.object_count >= self.num_objects
        )
        if not need_update:
            return

        file_path = os.path.join(self.save_dir, "file-%d.jsonl" % self.file_count)

        if self.file_handler is not None:
            self.file_handler.close()

        self.file_handler = open(file_path, "wb")
        self.file_count += 1
        self.object_count = 0

    def save(self, obj):
        """Append ``obj`` as one JSON line to the current shard.

        Raises:
            ValueError: If the saver has already been closed.
        """
        if self.file_handler is None:
            raise ValueError("save() called on a closed JsonlGzSaver")

        # Rotate *before* writing rather than after: a new shard is only
        # created when there is an object to put into it, so no empty
        # trailing shard is left behind.
        self._update_handler()

        json_obj = json.dumps(obj) + "\n"
        self.file_handler.write(json_obj.encode("utf-8"))
        self.object_count += 1

    def close(self):
        """Close the current shard; calling this more than once is harmless."""
        if self.file_handler is not None:
            self.file_handler.close()
            self.file_handler = None
121
+
122
+ # Multiprocessing --------------------------------
123
+
124
def pmap(map_fn, data, processes=None):
    """Lazily apply ``map_fn`` to every item of ``data``.

    With a single worker the items are mapped in-process and results come
    back in input order. With several workers a ``multiprocessing.Pool`` is
    used via ``imap_unordered``, so results arrive in completion order.

    Args:
        map_fn: Callable applied to each item. It must be picklable when a
            process pool is used.
        data: Iterable of input items; it is consumed exactly once.
        processes: Number of worker processes. ``None`` (the default) uses
            all CPUs when the machine has more than four and falls back to
            in-process mapping otherwise.

    Yields:
        The result of ``map_fn`` for each item of ``data``.
    """
    if processes is None:
        cpu_count = mp.cpu_count()
        # Too few CPUs for multiprocessing: stay in-process.
        processes = cpu_count if cpu_count > 4 else 1

    if processes <= 1:
        yield from map(map_fn, data)
        # Stop here; falling through would start a pool and iterate
        # ``data`` a second time.
        return

    with mp.Pool(processes=processes) as pool:
        yield from pool.imap_unordered(map_fn, data)
135
+
136
+
137
def main():
    """Convert the ``*.tar.gz`` archives in ``input_dir`` into jsonl shards.

    Command-line arguments:
        input_dir: Directory searched (non-recursively) for ``*.tar.gz``.
        output_dir: Directory receiving the shard files; created on demand.

    Every record loaded from the archives is passed through ``process_slc``;
    records for which it returns ``None`` are dropped, all others are saved.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_dir")
    parser.add_argument("output_dir")

    args = parser.parse_args()

    # glob returns files in arbitrary order; sort for a reproducible run.
    tar_files = sorted(glob(os.path.join(args.input_dir, "*.tar.gz")))

    # Make sure the saver can open its first shard on a fresh output path.
    os.makedirs(args.output_dir, exist_ok=True)

    slc_saver = JsonlGzSaver(args.output_dir)

    try:
        process_map = pmap(process_slc, load_slcs_from_tar(tar_files))
        # NOTE(review): total=66e6 is a hard-coded progress-bar estimate,
        # presumably the record count of the original dataset; confirm.
        for output in tqdm(process_map, total=66e6):
            if output is None:
                continue
            slc_saver.save(output)
    finally:
        # Always close the current shard, even when processing fails.
        slc_saver.close()
155
+
156
+
157
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
0 commit comments