diff --git a/tokenizers/javascript/.gitignore b/tokenizers/javascript/.gitignore
new file mode 100644
index 000000000..c9e792f77
--- /dev/null
+++ b/tokenizers/javascript/.gitignore
@@ -0,0 +1,6 @@
+*.swp
+*.swo
+*.log
+
+node_modules
+.DS_Store
diff --git a/tokenizers/javascript/index.js b/tokenizers/javascript/index.js
new file mode 100644
index 000000000..7b20f7364
--- /dev/null
+++ b/tokenizers/javascript/index.js
@@ -0,0 +1,90 @@
+const esprima = require('esprima')
+const escodegen = require('escodegen')
+const fs = require('fs-extra-promise')
+const tokenizer = require('./tokenizer')
+
+const immutable = require('immutable')
+const walk = require('esprima-walk')
+
+const { base64FileName } = require('./util')
+
+const estools = require('estools')
+
+const TOKENIZER_SCOPE_FILE = 'file-scope'
+const TOKENIZER_SCOPE_FUNCTION = 'function-scope'
+
+const TOKENIZER_SCOPE = TOKENIZER_SCOPE_FILE
+
+// TODO: estools map / filter / traverse (instead of walk)
+// - filter subfunctions from function asts somehow
+// - test on SCC
+
+// TODO: get rid of the function block and indentation
+const regenerateFunctionCode = function(functionAst) {
+  const codegenOptions = { // NOTE: doesn't help
+    format: {
+      parentheses: false
+    }
+  }
+
+  // NOTE: functionAst.body omits the function signature (returns the block only)
+  return escodegen.generate(functionAst.body, codegenOptions)
+}
+
+const processFile = function(fileName, data) {
+  //let parentId = base64FileName(fileName) // TODO: incorporate repo name / hash
+  let parentId = fileName
+  let blockId = 1
+
+  if (TOKENIZER_SCOPE === TOKENIZER_SCOPE_FILE) {
+    return immutable.List.of(tokenizer(data, parentId, blockId))
+  }
+
+  const options = {
+    loc: true,
+    range: true,
+    comment: true,
+    attachComment: true
+  }
+  const fileAst = esprima.parse(data, options);
+
+  let functions = immutable.List()
+  let functionTokens = immutable.List()
+  walk(fileAst, (node) => {
+    if (node.type == 'FunctionExpression') {
+      // const functionAstShallow = estools.map(node, (subNode) => {
+      //   if (subNode === undefined || subNode.type === undefined) return
+      //   if (subNode.type == 'FunctionExpression')
+      //     return {}
+      //   else return subNode
+      // })
+      //console.log(functionAstShallow)
+      //process.exit(1)
+      const functionAstShallow = node
+      const functionCode = regenerateFunctionCode(functionAstShallow)
+      functions = functions.push(functionCode)
+
+      const tokenizedFunction = tokenizer(functionCode, parentId, blockId++)
+      if (tokenizedFunction)
+        functionTokens = functionTokens.push(tokenizedFunction)
+    }
+  })
+
+  return functionTokens
+}
+
+
+const outputFile = function(functionTokens) {
+  functionTokens.forEach((f) => {
+    //console.log("===")
+    console.log(f)
+    //console.log("===")
+  })
+}
+
+// TODO: check input
+const fileName = process.argv[2]
+
+fs.readFileAsync(fileName).then((data) => {
+  outputFile(processFile(fileName, data.toString()))
+});
diff --git a/tokenizers/javascript/package.json b/tokenizers/javascript/package.json
new file mode 100644
index 000000000..20c489029
--- /dev/null
+++ b/tokenizers/javascript/package.json
@@ -0,0 +1,25 @@
+{
+  "name": "jstokenizer",
+  "version": "1.0.0",
+  "description": "",
+  "main": "index.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "Jakub Žitný (https://github.com/jakubzitny)",
+  "license": "ISC",
+  "repository": {
+    "url": "https://github.com/Mondego/SourcererCC.git",
+    "type": "git"
+  },
+  "dependencies": {
+    "escodegen": "^1.8.0",
+    "esprima": "^2.7.2",
+    "esprima-ast-utils": "0.0.6",
+    "esprima-walk": "^0.1.0",
+    "estools": "^2.1.0",
+    "fs-extra-promise": "^0.3.1",
+    "immutable": "^3.8.1",
+    "lodash": "^4.13.1"
+  }
+}
diff --git a/tokenizers/javascript/readme.md b/tokenizers/javascript/readme.md
new file mode 100644
index 000000000..61e18622e
--- /dev/null
+++ b/tokenizers/javascript/readme.md
@@ -0,0 +1,6 @@
+# JavaScript tokenizer for SourcererCC
+
+- use `node 6.*`
+- run `npm install` from this directory to install dependencies
+- run as `node index.js /path/to/file.js`
+- (carefully) use `batch.sh` to apply it to a larger dataset
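For orientation, this is the shape of the single line the tokenizer emits per block (derived from `tokenizer.js` below): `parentId,blockId,` followed by `@#@` and a comma-separated list of `token@@::@@count` pairs. The file name and token values here are a hypothetical example, and the order of the pairs is not guaranteed:

```
example.js,1,@#@var@@::@@1,answer@@::@@1,=@@::@@1,42@@::@@1
```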
"estools": "^2.1.0", + "fs-extra-promise": "^0.3.1", + "immutable": "^3.8.1", + "lodash": "^4.13.1" + } +} diff --git a/tokenizers/javascript/readme.md b/tokenizers/javascript/readme.md new file mode 100644 index 000000000..61e18622e --- /dev/null +++ b/tokenizers/javascript/readme.md @@ -0,0 +1,6 @@ +# JavaScript tokenizer for SourcererCC + +- use `node 6.*` +- run `npm install` from this directory to install dependencies +- run as `node index.js /path/to/file.js` +- (carefully) use `batch.sh` to apply to larger dataset diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js new file mode 100644 index 000000000..b543b9d2b --- /dev/null +++ b/tokenizers/javascript/tokenizer.js @@ -0,0 +1,110 @@ +const _ = require('lodash') +const immutable = require('immutable') +const fs = require('fs-extra-promise') +const esprima = require('esprima') + +const MAIN_DELIMITER = '@#@' +const COUNT_DELIMITER = '@@::@@' +const TOKEN_DELIMITER = ',' +const TOKEN_DELIMITER_REPLACEMENT = "_" +const WHITESPACES = /(\s+)/g + +const filterTokens = function (type, token) { + return token.type == type +} + + +// NOTE: http://esprima.org/doc/#usage +const tokenTypes = immutable.List.of( + 'Boolean', + 'Identifier', + 'Keyword', + 'Null', + 'Numeric', + 'Punctuator', + 'String', + 'RegularExpression' +) +// jakubkoo.. ako sa spraví to šedé s hviezdičkou, že dve a tri +// že kde presne interakcie sú v organizme +const tokenFilters = tokenTypes.map((tokenType) => { + return _.partial(filterTokens, tokenType) +}) + +// NOTE: Filter out hashbang lines +const HASHBANG = /^#!/ +const filterHashbangLine = function(code) { + const firstLineLoc = code.indexOf('\n') + const firstLine = code.slice(0, firstLineLoc).toString() + if (firstLine.search(HASHBANG) == -1) + return code + + return code.slice(firstLineLoc) +} + +// TODO: handle "#!/usr/bin/env node" +// TODO: handle +const tokenizer = function(code, parentId, blockId) { + const options = { } + tokensRaw = esprima.tokenize(filterHashbangLine(code), options) + + // TODO: refactor these + const tokens = immutable.List(tokensRaw).flatMap((token) => { + if (token.value.indexOf(TOKEN_DELIMITER) != -1) + const tokenDelimiters = new RegExep(TOKEN_DELIMITER, 'g') + token.value = + token.value.replace(tokenDelimiters, TOKEN_DELIMITER_REPLACEMENT) + + // NOTE: get rid of all whitespaces, dey sak + if (token.value.search(WHITESPACES) != -1) + token.value = token.value.replace(WHITESPACES, '') + + // NOTE: skip RegExes, SCC has weird problems with it + if (token.type == 'RegularExpression') + return immutable.List() + + //if (token.type != 'String') + return immutable.List.of(token); + + // NOTE: now it's string + // const stringTokensRaw = token.value.split(WHITESPACE) + // const stringTokens = stringTokensRaw.map((stringToken) => { + // return { value: stringToken } + // }) + // return immutable.List(stringTokens) + }) + + // TODO: reduce to map + // const filteredTokens = tokenFilters.map((tokenFilter) => { + // return tokens.filter(tokenFilter) + // }) + + let uniqueTokens = immutable.Map() + tokens.forEach((token) => { + if (uniqueTokens.has(token.value)) { + newUniqueTokens = uniqueTokens.updateIn( + [ token.value ], + (count) => { + return count + 1 + }) + } else { + newUniqueTokens = uniqueTokens.set(token.value, 1) + } + uniqueTokens = newUniqueTokens + }) + + const tokenPairs = uniqueTokens.map((count, token) => { + return `${token}${COUNT_DELIMITER}${count}` + }) + + if (tokenPairs.size == 0) + return '' + + const lhs = `${parentId},${blockId},` 
diff --git a/tokenizers/javascript/util.js b/tokenizers/javascript/util.js
new file mode 100644
index 000000000..178380f76
--- /dev/null
+++ b/tokenizers/javascript/util.js
@@ -0,0 +1,10 @@
+// NOTE: encode a file name as base64 (e.g. for a compact block parent id)
+const base64FileName = function(fileName) {
+  const fileNameBuffer = Buffer.from(fileName)
+  return fileNameBuffer.toString('base64')
+}
+
+
+module.exports = {
+  base64FileName
+}
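`base64FileName` is currently referenced only from a commented-out line in `index.js` (the TODO about folding a repo name/hash into the parent id). A one-line sketch of what it returns, with a hypothetical path:

```js
const { base64FileName } = require('./util')

console.log(base64FileName('foo/bar.js'))  // "Zm9vL2Jhci5qcw=="
```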