Skip to content

Commit e224ba7

Browse files
committed
Big update to get the Java stability test going. It now compares misclassifications rather than whitespace edit distance. Token analysis info is tracked better too. Some refactoring of the comment handling; it was spitting out too many newlines.
1 parent 4bd245f commit e224ba7

File tree

9 files changed

+117
-78
lines changed

9 files changed

+117
-78
lines changed

java/src/org/antlr/codebuff/CollectFeatures.java

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ public class CollectFeatures {
9696
public static final int INDEX_INFO_CHARPOS = 17;
9797

9898
public static final int NUM_FEATURES = 18;
99+
public static final int ANALYSIS_START_TOKEN_INDEX = 1; // we use current and previous token in context so can't start at index 0
99100

100101
public static FeatureMetaData[] FEATURES_INJECT_WS = { // inject ws or nl
101102
new FeatureMetaData(FeatureType.TOKEN, new String[] {"", "LT(-1)"}, 1),
@@ -186,7 +187,7 @@ public CollectFeatures(InputDocument doc, int tabSize, Map<String, List<Pair<Int
186187

187188
public void computeFeatureVectors() {
188189
List<Token> realTokens = getRealTokens(tokens);
189-
for (int i = 2; i<realTokens.size(); i++) { // can't process first 2 tokens
190+
for (int i = ANALYSIS_START_TOKEN_INDEX; i<realTokens.size(); i++) { // can't process first token
190191
int tokenIndexInStream = realTokens.get(i).getTokenIndex();
191192
computeFeatureVectorForToken(tokenIndexInStream);
192193
}
@@ -243,8 +244,6 @@ else if ( ws>0 ) {
243244
public int getAlignmentCategory(TerminalNode node, Token curToken, int columnDelta) {
244245
int aligned = CAT_NO_ALIGNMENT;
245246

246-
ParserRuleContext parent = (ParserRuleContext)node.getParent();
247-
248247
// at a newline, are we aligned with a prior sibling (in a list) etc...
249248
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
250249
Pair<ParserRuleContext, Integer> pair =
@@ -276,14 +275,46 @@ else if ( columnDelta!=0 ) {
276275

277276
public static int getPrecedingNL(CommonTokenStream tokens, int i) {
278277
int precedingNL = 0;
279-
List<Token> wsTokensBeforeCurrentToken = tokens.getHiddenTokensToLeft(i);
280-
if ( wsTokensBeforeCurrentToken==null ) return 0;
281-
for (Token t : wsTokensBeforeCurrentToken) {
282-
precedingNL += Tool.count(t.getText(), '\n');
278+
List<Token> previousWS = getPreviousWS(tokens, i);
279+
if ( previousWS!=null ) {
280+
for (Token ws : previousWS) {
281+
precedingNL += Tool.count(ws.getText(), '\n');
282+
}
283283
}
284284
return precedingNL;
285285
}
286286

287+
// if we have non-ws tokens like comments, we only count ws after last comment
288+
public static List<Token> getPreviousWS(CommonTokenStream tokens, int i) {
289+
List<Token> hiddenTokensToLeft = tokens.getHiddenTokensToLeft(i);
290+
if ( hiddenTokensToLeft==null ) return null;
291+
if ( hasCommentToken(hiddenTokensToLeft) ) {
292+
for (int j = hiddenTokensToLeft.size()-1; j>=0; j--) {
293+
Token hidden = hiddenTokensToLeft.get(j);
294+
String hiddenText = hidden.getText();
295+
if ( !hiddenText.matches("\\s+") ) {
296+
return hiddenTokensToLeft.subList(j+1, hiddenTokensToLeft.size());
297+
}
298+
}
299+
return null;
300+
}
301+
else {
302+
return hiddenTokensToLeft;
303+
}
304+
}
305+
306+
public static boolean hasCommentToken(List<Token> hiddenTokensToLeft) {
307+
boolean hasComment = false;
308+
for (Token hidden : hiddenTokensToLeft) {
309+
String hiddenText = hidden.getText();
310+
if ( !hiddenText.matches("\\s+") ) {
311+
hasComment = true;
312+
break;
313+
}
314+
}
315+
return hasComment;
316+
}
317+
287318
/** Walk upwards from node while p.start == token; return null if there is
288319
* no ancestor starting at token.
289320
*/

java/src/org/antlr/codebuff/FeatureMetaDataTweaker.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,7 @@ public static void main(String[] args) throws Exception {
184184
corpusDir = "../samples/stringtemplate4/org/stringtemplate/v4/compiler/";
185185
testFileDir = "../samples/stringtemplate4/org/stringtemplate/v4/compiler/";
186186
}
187-
Corpus corpus = Tool.train(corpusDir, ".*\\.java", JavaLexer.class, JavaParser.class, "compilationUnit", tabSize);
187+
Corpus corpus = Tool.train(corpusDir, ".*\\.java", JavaLexer.class, JavaParser.class, "compilationUnit", tabSize, true);
188188

189189
List<String> allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java");
190190
ArrayList<InputDocument> documents = (ArrayList<InputDocument>) Tool.load(allFiles, JavaLexer.class, tabSize);

java/src/org/antlr/codebuff/Formatter.java

Lines changed: 19 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package org.antlr.codebuff;
22

3-
import com.google.common.base.CharMatcher;
43
import org.antlr.v4.runtime.CommonToken;
54
import org.antlr.v4.runtime.CommonTokenStream;
65
import org.antlr.v4.runtime.ParserRuleContext;
@@ -103,7 +102,7 @@ public String format() {
103102

104103

105104
realTokens = getRealTokens(tokens);
106-
for (int i = 2; i<realTokens.size(); i++) { // can't process first 2 tokens
105+
for (int i = CollectFeatures.ANALYSIS_START_TOKEN_INDEX; i<realTokens.size(); i++) { // can't process first token
107106
int tokenIndexInStream = realTokens.get(i).getTokenIndex();
108107
processToken(i, tokenIndexInStream);
109108
}
@@ -130,13 +129,12 @@ public void processToken(int indexIntoRealTokens, int tokenIndexInStream) {
130129
}
131130
else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
132131
ws = CollectFeatures.unwscat(injectNL_WS);
132+
if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
133+
ws = 1;
134+
}
133135
}
134136

135-
if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
136-
ws = 1;
137-
}
138-
139-
int align = CAT_NO_ALIGNMENT;
137+
int alignOrIndent = CAT_NO_ALIGNMENT;
140138

141139
if ( newlines>0 ) {
142140
output.append(Tool.newlines(newlines));
@@ -156,17 +154,17 @@ else if ( (injectNL_WS&0xFF)==CAT_INJECT_WS ) {
156154
// if we decide to inject a newline, we better recompute this value before classifying alignment
157155
features[INDEX_MATCHING_TOKEN_DIFF_LINE] = getMatchingSymbolOnDiffLine(doc, node, line);
158156

159-
align = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
157+
alignOrIndent = alignClassifier.classify(k, features, corpus.align, MAX_CONTEXT_DIFF_THRESHOLD);
160158

161-
if ( align==CAT_INDENT ) {
159+
if ( alignOrIndent==CAT_INDENT ) {
162160
if ( firstTokenOnPrevLine!=null ) { // if not on first line, we cannot indent
163161
int indentedCol = firstTokenOnPrevLine.getCharPositionInLine()+INDENT_LEVEL;
164162
charPosInLine = indentedCol;
165163
output.append(Tool.spaces(indentedCol));
166164
}
167165
}
168-
else if ( (align&0xFF)==CAT_ALIGN_WITH_ANCESTOR_CHILD ) {
169-
int[] deltaChild = CollectFeatures.unaligncat(align);
166+
else if ( (alignOrIndent&0xFF)==CAT_ALIGN_WITH_ANCESTOR_CHILD ) {
167+
int[] deltaChild = CollectFeatures.unaligncat(alignOrIndent);
170168
int deltaFromAncestor = deltaChild[0];
171169
int childIndex = deltaChild[1];
172170
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
@@ -181,16 +179,16 @@ else if ( child instanceof TerminalNode ){
181179
}
182180
else {
183181
// uh oh.
184-
System.err.println("Whoops. Tried access invalid child");
182+
System.err.println("Whoops. Tried to access invalid child");
185183
}
186184
if ( start!=null ) {
187185
int indentCol = start.getCharPositionInLine();
188186
charPosInLine = indentCol;
189187
output.append(Tool.spaces(indentCol));
190188
}
191189
}
192-
else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
193-
int deltaFromAncestor = CollectFeatures.unindentcat(align);
190+
else if ( (alignOrIndent&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
191+
int deltaFromAncestor = CollectFeatures.unindentcat(alignOrIndent);
194192
ParserRuleContext earliestLeftAncestor = earliestAncestorStartingWithToken(node, curToken);
195193
ParserRuleContext ancestor = CollectFeatures.getAncestor(earliestLeftAncestor, deltaFromAncestor);
196194
Token start = ancestor.getStart();
@@ -206,7 +204,7 @@ else if ( (align&0xFF)==CAT_INDENT_FROM_ANCESTOR_FIRST_TOKEN ) {
206204
}
207205

208206
TokenPositionAnalysis tokenPositionAnalysis =
209-
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, newlines, align, ws);
207+
getTokenAnalysis(features, indexIntoRealTokens, tokenIndexInStream, injectNL_WS, alignOrIndent);
210208
analysis.setSize(tokenIndexInStream+1);
211209
analysis.set(tokenIndexInStream, tokenPositionAnalysis);
212210

@@ -235,14 +233,7 @@ public void emitCommentsToTheLeft(int tokenIndexInStream) {
235233
List<Token> hiddenTokensToLeft = tokens.getHiddenTokensToLeft(tokenIndexInStream);
236234
if ( hiddenTokensToLeft!=null ) {
237235
// if at least one is not whitespace, assume it's a comment and print all hidden stuff including whitespace
238-
boolean hasComment = false;
239-
for (Token hidden : hiddenTokensToLeft) {
240-
String hiddenText = hidden.getText();
241-
if ( !hiddenText.matches("\\s+") ) {
242-
hasComment = true;
243-
break;
244-
}
245-
}
236+
boolean hasComment = CollectFeatures.hasCommentToken(hiddenTokensToLeft);
246237
if ( hasComment ) {
247238
// avoid whitespace at end of sequence as we'll inject that
248239
int last = -1;
@@ -259,7 +250,7 @@ public void emitCommentsToTheLeft(int tokenIndexInStream) {
259250
String hiddenText = hidden.getText();
260251
output.append(hiddenText);
261252
if ( hiddenText.matches("\\n+") ) {
262-
line += CharMatcher.is('\n').countIn(hiddenText);
253+
line += Tool.count(hiddenText, '\n');
263254
charPosInLine = 0;
264255
}
265256
else {
@@ -272,28 +263,20 @@ public void emitCommentsToTheLeft(int tokenIndexInStream) {
272263
}
273264

274265
public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealTokens, int tokenIndexInStream,
275-
int injectNewline,
276-
int align,
277-
int ws)
266+
int injectNL_WS, int alignOrIndent)
278267
{
279268
CommonToken curToken = (CommonToken)tokens.get(tokenIndexInStream);
280269
// compare prediction of newline against original, alert about any diffs
281270
CommonToken prevToken = originalTokens.get(curToken.getTokenIndex()-1);
282271
CommonToken originalCurToken = originalTokens.get(curToken.getTokenIndex());
283272

284-
boolean failsafeTriggered = false;
285-
if ( ws==0 && cannotJoin(realTokens.get(indexIntoRealTokens-1), curToken) ) { // failsafe!
286-
ws = 1;
287-
failsafeTriggered = true;
288-
}
289-
290273
boolean prevIsWS = prevToken.getChannel()==Token.HIDDEN_CHANNEL; // assume this means whitespace
291274
int actualNL = Tool.count(prevToken.getText(), '\n');
292275
String newlinePredictionString = String.format("### line %d: predicted %d \\n actual ?",
293-
originalCurToken.getLine(), injectNewline, prevIsWS ? actualNL : "none");
276+
originalCurToken.getLine(), injectNL_WS, prevIsWS ? actualNL : "none");
294277
String alignPredictionString = String.format("### line %d: predicted %d actual %s",
295278
originalCurToken.getLine(),
296-
align,
279+
alignOrIndent,
297280
"?");
298281

299282
String newlineAnalysis = newlinePredictionString+"\n"+
@@ -302,7 +285,7 @@ public TokenPositionAnalysis getTokenAnalysis(int[] features, int indexIntoRealT
302285
String alignAnalysis =alignPredictionString+"\n"+
303286
alignClassifier.getPredictionAnalysis(doc, k, features, corpus.align,
304287
MAX_CONTEXT_DIFF_THRESHOLD);
305-
return new TokenPositionAnalysis(newlineAnalysis, alignAnalysis, "n/a");
288+
return new TokenPositionAnalysis(curToken, injectNL_WS, newlineAnalysis, alignOrIndent, alignAnalysis);
306289
}
307290

308291
/** Do not join two words like "finaldouble" or numbers like "3double",

java/src/org/antlr/codebuff/Optimizer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ public static void main(String[] args) throws Exception {
162162
corpusDir = "../samples/stringtemplate4/org/stringtemplate/v4/debug/";
163163
testFileDir = "../samples/stringtemplate4/org/stringtemplate/v4/debug/";
164164
}
165-
Corpus corpus = Tool.train(corpusDir, ".*\\.java", JavaLexer.class, JavaParser.class, "compilationUnit", tabSize);
165+
Corpus corpus = Tool.train(corpusDir, ".*\\.java", JavaLexer.class, JavaParser.class, "compilationUnit", tabSize, true);
166166

167167
List<String> allFiles = Tool.getFilenames(new File(testFileDir), ".*\\.java");
168168
ArrayList<InputDocument> documents = (ArrayList<InputDocument>) Tool.load(allFiles, JavaLexer.class, tabSize);
java/src/org/antlr/codebuff/TokenPositionAnalysis.java

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
11
package org.antlr.codebuff;
22

3+
import org.antlr.v4.runtime.Token;
4+
35
public class TokenPositionAnalysis {
4-
public int charIndexStart; // where in output buffer the associated token starts; used to respond to clicks in formatted text
5-
public int charIndexStop; // stop index (inclusive)
6-
public String newline = "n/a";
7-
public String ws = "n/a";
8-
public String align = "n/a";
6+
public Token t; // token from the input stream; its position will usually differ from charIndexStart etc...
7+
public int charIndexStart; // where in *output* buffer the associated token starts; used to respond to clicks in formatted text
8+
public int charIndexStop; // stop index (inclusive)
9+
public int ws;
10+
public int align;
11+
public String wsAnalysis = "n/a";
12+
public String alignAnalysis = "n/a";
913

10-
public TokenPositionAnalysis(String newline, String align, String ws) {
11-
this.align = align;
12-
this.newline = newline;
14+
public TokenPositionAnalysis(Token t, int ws, String wsAnalysis, int align, String alignAnalysis) {
15+
this.t = t;
1316
this.ws = ws;
17+
this.wsAnalysis = wsAnalysis;
18+
this.align = align;
19+
this.alignAnalysis = alignAnalysis;
1420
}
1521
}

java/src/org/antlr/codebuff/Tool.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
*
3333
* Tool -antlr grammars /Users/parrt/antlr/code/grammars-v4/clojure/Clojure.g4
3434
* Tool -java ../samples/stringtemplate4 src/org/antlr/codebuff/Tool.java
35+
* Tool -java ../samples/stringtemplate4 ../samples/stringtemplate4/org/stringtemplate/v4/AutoIndentWriter.java
3536
*/
3637
public class Tool {
3738
public static boolean showFileNames = false;
@@ -49,7 +50,7 @@ public static void main(String[] args)
4950
String testFilename = args[2];
5051
String output;
5152
if ( language.equals("-java") ) {
52-
Corpus corpus = train(corpusDir, ".*\\.java", JavaLexer.class, JavaParser.class, "compilationUnit", tabSize);
53+
Corpus corpus = train(corpusDir, ".*\\.java", JavaLexer.class, JavaParser.class, "compilationUnit", tabSize, true);
5354
InputDocument testDoc = load(testFilename, JavaLexer.class, tabSize);
5455
Pair<String,List<TokenPositionAnalysis>> results = format(corpus, testDoc, JavaLexer.class, JavaParser.class, "compilationUnit", tabSize);
5556
output = results.a;
@@ -58,7 +59,7 @@ public static void main(String[] args)
5859
controller.show();
5960
}
6061
else {
61-
Corpus corpus = train(corpusDir, ".*\\.g4", ANTLRv4Lexer.class, ANTLRv4Parser.class, "grammarSpec", tabSize);
62+
Corpus corpus = train(corpusDir, ".*\\.g4", ANTLRv4Lexer.class, ANTLRv4Parser.class, "grammarSpec", tabSize, true);
6263
InputDocument testDoc = load(testFilename, ANTLRv4Lexer.class, tabSize);
6364
Pair<String,List<TokenPositionAnalysis>> results = format(corpus, testDoc, ANTLRv4Lexer.class, ANTLRv4Parser.class, "grammarSpec", tabSize);
6465
output = results.a;
@@ -110,7 +111,8 @@ public static Corpus train(String rootDir,
110111
Class<? extends Lexer> lexerClass,
111112
Class<? extends Parser> parserClass,
112113
String startRuleName,
113-
int tabSize)
114+
int tabSize,
115+
boolean shuffleFeatureVectors)
114116
throws Exception
115117
{
116118
List<String> allFiles = getFilenames(new File(rootDir), fileRegex);
@@ -143,7 +145,7 @@ public static Corpus train(String rootDir,
143145
}
144146

145147
Corpus corpus = processSampleDocs(documents, lexerClass, parserClass, tabSize, ruleToPairsBag);
146-
corpus.randomShuffleInPlace();
148+
if ( shuffleFeatureVectors ) corpus.randomShuffleInPlace();
147149
corpus.buildTokenContextIndex();
148150
return corpus;
149151
}

java/src/org/antlr/codebuff/gui/GUIController.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,9 @@ public void caretUpdate(CaretEvent e) {
109109
if ( analysis!=null ) {
110110
highlighter.addHighlight(analysis.charIndexStart, analysis.charIndexStop+1, painter);
111111
}
112-
scope.injectNLConsole.setText(analysis!=null ? analysis.newline : "");
113-
scope.alignConsole.setText(analysis!=null ? analysis.align : "");
112+
scope.injectNLConsole.setText(analysis!=null ? analysis.wsAnalysis : "");
114113
scope.injectNLConsole.setCaretPosition(0);
114+
scope.alignConsole.setText(analysis!=null ? analysis.alignAnalysis : "");
115115
scope.alignConsole.setCaretPosition(0);
116116
}
117117
catch (Exception ex) {

java/src/org/antlr/codebuff/kNNClassifier.java

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,6 @@ public kNNClassifier(Corpus corpus, FeatureMetaData[] FEATURES) {
2828
maxDistanceCount = n;
2929
}
3030

31-
/** Classify unknown for all Y at once */
32-
public int[] classify(int k, int[] unknown, double distanceThreshold) {
33-
int[] categories = new int[Corpus.NUM_DEPENDENT_VARS];
34-
35-
Neighbor[] kNN = kNN(unknown, k, distanceThreshold);
36-
HashBag<Integer> votesBag = getVotesBag(kNN, k, unknown, corpus.injectWhitespace);
37-
categories[Corpus.INDEX_FEATURE_NEWLINES] = getCategoryWithMostVotes(votesBag);
38-
39-
votesBag = getVotesBag(kNN, k, unknown, corpus.align);
40-
categories[Corpus.INDEX_FEATURE_ALIGN_WITH_PREVIOUS] = getCategoryWithMostVotes(votesBag);
41-
42-
return categories;
43-
}
44-
4531
/**
4632
* Walk all training samples and compute distance(). Return indexes of k
4733
* smallest distance values. Categories can be any negative or positive

0 commit comments

Comments
 (0)