Skip to content

Commit efef52a

Browse files
committed
Added documentation to relevant functions
1 parent e5c17f8 commit efef52a

File tree

3 files changed

+188
-22
lines changed

3 files changed

+188
-22
lines changed

code_diff/__init__.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,46 @@
1010
# Main method --------------------------------------------------------
1111

1212
def difference(source, target, lang = "guess", **kwargs):
13+
"""
14+
Computes the smallest difference between source and target
15+
16+
Computes the smallest code difference between the given
17+
code snippets. Difference is computed by a simultaneous
18+
walk over the ASTs of the given code snippets. The result
19+
is the smallest code snippet that represents
20+
the first AST node found to be different.
21+
22+
Parameters
23+
----------
24+
source : str
25+
Source code which should be compared
26+
27+
target : str
28+
Comparison target as a code string
29+
30+
lang : [python, java, javascript, ...]
31+
Programming language which should be used
32+
to parse the code snippets.
33+
Default: guess (Currently not supported, will throw error)
34+
35+
syntax_error : [raise, warn, ignore]
36+
Strategy to handle syntax errors in code.
37+
To parse incomplete code snippets, 'ignore' should
38+
be selected to silence any warning.
39+
Default: raise (Raises an exception)
40+
41+
**kwargs : dict
42+
Further config options that are specific to
43+
the underlying AST parser. See code_tokenize
44+
for more information.
45+
46+
Returns
47+
-------
48+
ASTDiff
49+
The smallest code change necessary
50+
to convert the source code into the target code.
51+
52+
"""
1353

1454
config = load_from_lang_config(lang, **kwargs)
1555
source_ast = parse_ast(source, lang = lang, **kwargs)
@@ -56,6 +96,56 @@ def diff_search(source_ast, target_ast):
5696
# AST Difference --------------------------------------------------------
5797

5898
class ASTDiff:
99+
"""
100+
Difference between two code snippets
101+
102+
This object represents the smallest code change
103+
necessary to transform a source code snippet
104+
into a target code.
105+
106+
Attributes
107+
----------
108+
is_single_statement : bool
109+
Whether the code difference only affects a single program statement
110+
111+
source_ast : ASTNode
112+
AST node related to the code change
113+
114+
source_text : str
115+
Source code which has to be changed
116+
117+
target_ast : ASTNode
118+
AST node which is different to the source AST
119+
120+
target_text : str
121+
Target text for converting source to target
122+
123+
Methods
124+
-------
125+
edit_script : list[EditOp]
126+
Computes a sequence of AST operations which need
127+
to be performed to translate source code into target code
128+
129+
Note: We balance performance and precision by computing
130+
the AST edit script at the current diff level. The
131+
algorithm runs the fastest on the smallest diff level
132+
but is also the most imprecise. To achieve the highest precision,
133+
the root_diff should be used.
134+
135+
sstub_pattern : SStuBPattern
136+
Categorizes the current diff into one of 20 SStuB categories.
137+
Note: Currently, this operation is only supported for
138+
Python code. Running the function on code in another language
139+
will cause an exception.
140+
141+
statement_diff : ASTDiff
142+
raises the AST difference to the statement level
143+
144+
root_diff : ASTDiff
145+
raises the AST difference to the root level (of each code snippet)
146+
147+
148+
"""
59149

60150
def __init__(self, config, source_ast, target_ast):
61151
self.config = config
@@ -88,6 +178,8 @@ def root_diff(self):
88178
return ASTDiff(self.config, ast_root(self.source_ast), ast_root(self.target_ast))
89179

90180
def sstub_pattern(self):
181+
if self.config.lang != "python":
182+
raise ValueError("SStuB can currently only be computed for Python code.")
91183

92184
if (parent_statement(self.config.statement_types, self.source_ast) is None
93185
or parent_statement(self.config.statement_types, self.target_ast) is None):

code_diff/ast.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,40 @@
66

77

88
class ASTNode(object):
9+
"""
10+
A representation of an AST node together with its children
11+
12+
Node Attributes
13+
---------------
14+
type : str
15+
Syntactic type of the AST node
16+
17+
text : str
18+
If this node belongs to a program token, then
19+
it contains the text of the program token. Otherwise, None.
20+
21+
children : list[ASTNode]
22+
Potentially empty list of child nodes
23+
24+
position : int
25+
If supported, the code position that is referenced by the AST node
26+
27+
parent : ASTNode
28+
If not root node, the AST parent of this node.
29+
30+
Subtree Attributes
31+
------------------
32+
subtree_hash : str
33+
A hash string representing the subtree of the AST node.
34+
Two subtrees are isomorphic if they have the same subtree hash.
35+
36+
subtree_height : int
37+
Longest path from this node to a leaf node
38+
39+
subtree_weight : int
40+
Count of all nodes in this subtree
41+
42+
"""
943

1044
def __init__(self, type, text = None, position = None, parent = None, children = None):
1145

@@ -264,6 +298,30 @@ def __call__(self, tokens):
264298

265299

266300
def parse_ast(source_code, lang = "guess", **kwargs):
301+
"""
302+
Parses a given source code string into its AST
303+
304+
Function to parse source code in the given language
305+
into its AST. As a backend, we employ
306+
code_tokenize (tree-sitter). The final
307+
AST is additionally analyzed to compute
308+
additional annotations
309+
310+
Parameters
311+
----------
312+
source_code : str
313+
Source code snippet as a string
314+
315+
lang : [python, java, javascript, ...]
316+
Language to parse the given source code
317+
Default: guess (Currently not supported; will raise error)
318+
319+
Returns
320+
-------
321+
ASTNode
322+
the root node of the computed AST
323+
324+
"""
267325

268326
# Parse AST
269327
kwargs["lang"] = lang

code_diff/sstubs.py

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from enum import Enum
2-
import re
32

43
class SStubPattern(Enum):
54

@@ -62,7 +61,9 @@ def classify_sstub(source_ast, target_ast):
6261
if source_name == target_name:
6362
classifier_fns.append(same_function_mod)
6463

65-
if _query_path(source_ast, "if_statement", "condition") or _query_path(source_ast, "elif_clause", "condition"):
64+
if (_query_path(source_ast, "if_statement", "condition")
65+
or _query_path(source_ast, "elif_clause", "condition")
66+
or _query_path(source_ast, "while_statement", "condition")):
6667
classifier_fns.append(change_if_statement)
6768

6869
if source_ast.type in ["tuple", "list", "dictionary", "set"]:
@@ -119,15 +120,15 @@ def pisomorph(A, B):
119120

120121
# Binary operand ----------------------------------------------------------------
121122

122-
def _is_binary_operand(source_ast, target_ast):
123-
return _query_path(source_ast, "binary_operator", "left") or _query_path(source_ast, "binary_operator", "right")
124-
125-
def is_boolean_operand(source_ast, target_ast):
126-
return _query_path(source_ast, "boolean_operator", "left") or _query_path(source_ast, "boolean_operator", "right")
127-
128123

129124
def is_binary_operand(source_ast, target_ast):
130-
return _is_binary_operand(source_ast, target_ast) or is_boolean_operand(source_ast, target_ast)
125+
126+
for bin_op_type in ["binary_operator", "comparison_operator", "boolean_operator"]:
127+
for direction in ["left", "right"]:
128+
if (_query_path(source_ast, bin_op_type, direction, depth = 1)):
129+
return True
130+
131+
return False
131132

132133

133134

@@ -218,16 +219,19 @@ def change_attribute_used(source_ast, target_ast):
218219

219220

220221
def change_identifier_used(source_ast, target_ast):
221-
return source_ast.type == "identifier"
222+
223+
# Following ManySStuBs, we ignore identifier changes whose parent node is a definition or declaration (method, class or variable declarations)
224+
if any(x in source_ast.parent.type for x in ["definition", "declaration"]):
225+
return False
226+
227+
return source_ast.type == "identifier" and target_ast.type == "identifier"
222228

223229

224230
def change_binary_operator(source_ast, target_ast):
225231

226-
for operator in ["binary_operator", "boolean_operator", "comparison_operator"]:
227-
if _query_path(source_ast, operator, "*", depth = 1):
228-
if (not _query_path(source_ast, operator, "left", depth = 1)
229-
and not _query_path(source_ast, operator, "right", depth = 1)):
230-
return True
232+
if source_ast.parent.type in ["binary_operator", "boolean_operator", "comparison_operator"]:
233+
bin_op = source_ast.parent
234+
return bin_op.children[1] == source_ast
231235

232236
return False
233237

@@ -332,12 +336,16 @@ def same_function_swap_args(source_ast, target_ast):
332336
if len(source_ast.children) != len(target_ast.children):
333337
return False
334338

335-
arguments = source_ast.children
336-
for arg in arguments:
337-
if not any(pisomorph(t, arg) for t in target_ast.children):
338-
return False
339+
src_arguments = source_ast.children
340+
target_arguments = target_ast.children
339341

340-
return True
342+
diff_args = [i for i, src_arg in enumerate(src_arguments) if not pisomorph(src_arg, target_arguments[i])]
343+
344+
if len(diff_args) != 2: return False
345+
346+
swap_0, swap_1 = diff_args
347+
return (pisomorph(src_arguments[swap_0], target_arguments[swap_1])
348+
and pisomorph(src_arguments[swap_1], target_arguments[swap_0]))
341349

342350

343351
same_function_edits = {
@@ -364,11 +372,18 @@ def same_function_mod(source_ast, target_ast):
364372

365373

366374
def more_specific_if(source_ast, target_ast):
375+
376+
if not target_ast.type == "boolean_operator": return False
377+
if target_ast.children[1].type != "and" : return False
378+
367379
return any(pisomorph(c, source_ast) for c in target_ast.children)
368380

369381

370382
def less_specific_if(source_ast, target_ast):
371-
return any(pisomorph(c, target_ast) for c in source_ast.children)
383+
if not target_ast.type == "boolean_operator": return False
384+
if target_ast.children[1].type != "or" : return False
385+
386+
return any(pisomorph(c, source_ast) for c in target_ast.children)
372387

373388

374389
def change_if_statement(source_ast, target_ast):
@@ -413,7 +428,8 @@ def add_function_around_expression(source_ast, target_ast):
413428
if argument_list.type != "argument_list":
414429
return False
415430

416-
if len(argument_list.children) != 3: return False
431+
# Adding arguments together with a function seems to be okay (see PySStuBs dataset)
432+
#if len(argument_list.children) != 3: return False
417433

418434
for arg in argument_list.children:
419435
if pisomorph(arg, source_ast):

0 commit comments

Comments
 (0)