Skip to content

Commit a8b87cd

Browse files
author
Cedric Richter
committed
Small script for processing git slcs
1 parent 583570b commit a8b87cd

File tree

4 files changed

+165
-2
lines changed

4 files changed

+165
-2
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,6 @@ dmypy.json
127127

128128
# Pyre type checker
129129
.pyre/
130+
131+
#VSCode
132+
.vscode/

code_diff/gumtree/chawathe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ def apply(self, operation):
260260
self.apply(Delete(insert_node))
261261

262262
wn = self.src._access_wn(insert_node)
263-
self.mod_children.insert(operation.position, wn)
263+
self.children.insert(operation.position, wn)
264264

265265
wn.mod_parent = self
266266

code_diff/sstubs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,9 @@ def change_iterable(source_ast, target_ast):
300300

301301
def add_function_around_expression(source_ast, target_ast):
302302
argument_list = target_ast.children[-1]
303-
assert argument_list.type == "argument_list", str(argument_list)
303+
304+
if argument_list.type == "argument_list":
305+
return False
304306

305307
if len(argument_list.children) != 3: return False
306308

run_slc_process.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
import os
2+
import argparse
3+
import tarfile
4+
import json
5+
import gzip
6+
7+
from glob import glob
8+
from tqdm import tqdm
9+
10+
import multiprocessing as mp
11+
12+
import code_diff as cd
13+
from code_diff.diff_utils import parse_hunks, clean_hunk
14+
15+
def load_slcs_from_tar(tar_files):
    """Stream the JSON objects stored in gzip-compressed tar archives.

    Every regular file inside each archive is read line by line; each
    non-blank line is decoded as UTF-8 and parsed as one JSON document.

    Args:
        tar_files: Iterable of paths to ``.tar.gz`` archives.

    Yields:
        The object parsed from each line, in archive and member order.

    Errors raised by ``tarfile.open`` or ``json.loads`` are propagated.
    """
    for tar_file in tar_files:
        # The context manager closes the archive once it is exhausted, when
        # parsing fails, or when the consumer closes the generator early.
        with tarfile.open(tar_file, "r:gz") as tar:
            for tarinfo in tar:
                # Directories, links, etc. carry no line data.
                if not tarinfo.isfile():
                    continue

                with tar.extractfile(tarinfo) as lines:
                    for line in lines:
                        # Tolerate blank lines (e.g. a doubled trailing newline)
                        # instead of failing inside json.loads.
                        if not line.strip():
                            continue
                        yield json.loads(line.decode("utf-8"))
25+
26+
27+
def create_output(slc, **kwargs):
    """Build the output record for one ``slc`` entry.

    A fixed set of fields is copied from ``slc``; the keyword arguments are
    then merged on top, so a keyword with the same name as a copied field
    replaces it.

    Args:
        slc: Mapping that provides every field listed in ``copied_fields``.
        **kwargs: Additional (or overriding) entries for the record.

    Returns:
        A new dict; ``slc`` itself is not modified.

    A field missing from ``slc`` propagates the lookup error.
    """
    copied_fields = (
        # Provenance of the change
        "project",
        "commit_sha",
        "parent_sha",
        "file_path",
        "project_url",
        # Labels attached to the change
        "likely_bug",
        "comodified",
        "in_function",
        # Raw diff text
        "diff",
    )

    record = {field: slc[field] for field in copied_fields}
    record.update(kwargs)
    return record
45+
46+
47+
def process_slc(slc):
    """Turn one ``slc`` entry into an output record, or reject it.

    The entry's ``"diff"`` text is split into hunks with ``parse_hunks``;
    each hunk is cleaned with ``clean_hunk`` and diffed with
    ``cd.difference`` (as Python code). Hunks for which ``cd.difference``
    raises ``ValueError`` are ignored.

    Args:
        slc: Mapping with a ``"diff"`` entry plus the fields read by
            ``create_output``.

    Returns:
        ``None`` unless exactly one hunk produced a diff. Otherwise the
        record from ``create_output`` extended with ``before``, ``after``,
        ``sstub_pattern`` and ``edit_script``.
    """
    candidates = []
    for raw_hunk in parse_hunks(slc["diff"]):
        cleaned = clean_hunk(raw_hunk)

        try:
            candidate = cd.difference(cleaned.before, cleaned.after, lang = "python")
        except ValueError:
            # This hunk could not be diffed; it does not count as a candidate.
            continue
        else:
            candidates.append(candidate)

    # Only entries with exactly one usable hunk are kept.
    if len(candidates) != 1:
        return None

    (source_diff,) = candidates

    pattern_name = source_diff.sstub_pattern().name
    script_text = str(source_diff.edit_script())

    try:
        # Prefer the statement-level texts when they are available ...
        stmt_diff = source_diff.statement_diff()
        before, after = stmt_diff.source_text, stmt_diff.target_text
    except ValueError:
        # ... and fall back to the texts of the diff itself otherwise.
        before, after = source_diff.source_text, source_diff.target_text

    return create_output(
        slc,
        before = before,
        after = after,
        sstub_pattern = pattern_name,
        edit_script = script_text,
    )
82+
83+
84+
# Save to jsonl.gz
85+
86+
class JsonlGzSaver:
    """Write JSON-serializable objects to numbered JSON Lines files.

    Objects are appended, one JSON document per line, to
    ``<save_dir>/file-<k>.jsonl``. Once a file holds ``num_objects``
    objects, the next saved object starts a new file with the next index.

    Despite the class name, files are written uncompressed by default.
    Pass ``compress=True`` to write gzip-compressed ``file-<k>.jsonl.gz``
    files instead.

    The saver works as a context manager; leaving the ``with`` block closes
    the current file.

    Args:
        save_dir: Directory for the output files; created if missing.
        num_objects: Maximum number of objects per file.
        compress: Whether to gzip the output files.
    """

    def __init__(self, save_dir, num_objects=1e5, compress=False):
        self.save_dir = save_dir
        self.num_objects = num_objects
        self.compress = compress

        self.object_count = 0  # objects written to the current file
        self.file_count = 0    # index of the next file to open

        # An empty save_dir means "current directory"; nothing to create then.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        self.file_handler = None
        self._update_handler()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _update_handler(self):
        """Open the next file if none is open or the current one is full."""
        is_open = self.file_handler is not None
        if is_open and self.object_count < self.num_objects:
            return

        suffix = ".jsonl.gz" if self.compress else ".jsonl"
        file_path = os.path.join(
            self.save_dir, "file-%d%s" % (self.file_count, suffix)
        )

        if is_open:
            self.file_handler.close()
            self.file_handler = None

        opener = gzip.open if self.compress else open
        self.file_handler = opener(file_path, "wb")
        self.file_count += 1
        self.object_count = 0

    def save(self, obj):
        """Append ``obj`` as one JSON line, rotating to a new file if needed.

        Calling this after ``close()`` starts a new file with the next index.
        """
        # Serialize first so an unserializable object never triggers a rotation.
        payload = (json.dumps(obj) + "\n").encode("utf-8")

        # Rotate *before* writing rather than after: a full file is only
        # replaced when there is really something more to write, so no empty
        # trailing file is left behind.
        self._update_handler()
        self.file_handler.write(payload)
        self.object_count += 1

    def close(self):
        """Close the current file, if any. Safe to call more than once."""
        if self.file_handler is not None:
            self.file_handler.close()
            self.file_handler = None
121+
122+
# Multiprocessing --------------------------------
123+
124+
def pmap(map_fn, data, processes=None):
    """Lazily apply ``map_fn`` to every item of ``data``.

    With more than four workers the items are handed to a
    ``multiprocessing.Pool`` and results are yielded as they complete, so
    their order is not guaranteed. Otherwise the items are mapped
    sequentially in the current process, in input order.

    Args:
        map_fn: Callable applied to each item. It is passed to
            ``Pool.imap_unordered`` when a pool is used.
        data: Iterable of input items.
        processes: Number of workers; defaults to ``mp.cpu_count()``.

    Yields:
        ``map_fn(item)`` for each item of ``data``.
    """
    worker_count = mp.cpu_count() if processes is None else processes

    if worker_count <= 4:  # Too few CPUs for multiprocessing
        yield from map(map_fn, data)
        # Stop here. Without this return the function would fall through,
        # start a pool anyway and feed `data` to it again, processing any
        # re-iterable input (e.g. a list) a second time.
        return

    with mp.Pool(processes=worker_count) as pool:
        yield from pool.imap_unordered(map_fn, data)
135+
136+
137+
def main():
    """Command-line entry point.

    Takes two positional arguments, ``input_dir`` and ``output_dir``. Every
    ``*.tar.gz`` archive directly inside ``input_dir`` is streamed through
    ``process_slc`` (via ``pmap``), and each non-``None`` result is written
    with a ``JsonlGzSaver`` rooted at ``output_dir``. The saver is closed
    even when processing fails.
    """
    cli = argparse.ArgumentParser()
    for positional in ("input_dir", "output_dir"):
        cli.add_argument(positional)
    options = cli.parse_args()

    archive_paths = glob(os.path.join(options.input_dir, "*.tar.gz"))
    saver = JsonlGzSaver(options.output_dir)

    try:
        results = pmap(process_slc, load_slcs_from_tar(archive_paths))
        # NOTE(review): 66e6 is a hard-coded progress-bar total, presumably
        # the expected number of input entries -- confirm against the data.
        for record in tqdm(results, total = 66e6):
            if record is not None:
                saver.save(record)
    finally:
        saver.close()
155+
156+
157+
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)