From 457ff40ed8ef42a6de6e20d4a614fbd4a014d9b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Fri, 3 Jun 2016 17:51:28 +0200
Subject: [PATCH 01/10] add javascript tokenizer

- copied from https://git.io/voeEf
- works on both file and function level
---
 tokenizers/javascript/.gitignore   |  6 ++
 tokenizers/javascript/index.js     | 88 ++++++++++++++++++++++++++++++
 tokenizers/javascript/package.json | 21 +++++++
 tokenizers/javascript/readme.md    |  5 ++
 tokenizers/javascript/tokenizer.js | 65 ++++++++++++++++++++++
 tokenizers/javascript/util.js      | 10 ++++
 6 files changed, 195 insertions(+)
 create mode 100644 tokenizers/javascript/.gitignore
 create mode 100644 tokenizers/javascript/index.js
 create mode 100644 tokenizers/javascript/package.json
 create mode 100644 tokenizers/javascript/readme.md
 create mode 100644 tokenizers/javascript/tokenizer.js
 create mode 100644 tokenizers/javascript/util.js

diff --git a/tokenizers/javascript/.gitignore b/tokenizers/javascript/.gitignore
new file mode 100644
index 000000000..c9e792f77
--- /dev/null
+++ b/tokenizers/javascript/.gitignore
@@ -0,0 +1,6 @@
+*.swp
+*.swo
+*.log
+
+node_modules
+.DS_Store
diff --git a/tokenizers/javascript/index.js b/tokenizers/javascript/index.js
new file mode 100644
index 000000000..1cb3601e4
--- /dev/null
+++ b/tokenizers/javascript/index.js
@@ -0,0 +1,88 @@
+const esprima = require('esprima')
+const escodegen = require('escodegen')
+const fs = require('fs-extra-promise')
+const tokenizer = require('./tokenizer')
+
+const immutable = require('immutable')
+const walk = require('esprima-walk')
+
+const { base64FileName } = require('./util')
+
+const estools = require('estools')
+
+const TOKENIZER_SCOPE_FILE = 'file-scope'
+const TOKENIZER_SCOPE_FUNCTION = 'function-scope'
+
+const TOKENIZER_SCOPE = TOKENIZER_SCOPE_FILE
+
+// TODO: estools map / filter / traverse (instead of walk)
+// - filter subfunctions from function asts somehow
+// - test on SCC
+
+// TODO: get rid of the function block and indentation
+const regenerateFunctionCode = function(functionAst) {
+  const codegenOptions = { // NOTE: doesn't help
+    format: {
+      parentheses: false
+    }
+  }
+
+  // NOTE: functionAst.body omits the function signature (returns block only)
+  return escodegen.generate(functionAst.body, {})
+}
+
+const processFile = function(fileName, data) {
+  //let parentId = base64FileName(fileName) // TODO: incorporate repo name / hash
+  let parentId = fileName
+  let blockId = 1
+
+  if (TOKENIZER_SCOPE === TOKENIZER_SCOPE_FILE) {
+    return immutable.List.of(tokenizer(data, parentId, blockId))
+  }
+
+  const options = {
+    loc: true,
+    range: true,
+    comment: true,
+    attachComment: true
+  }
+  const fileAst = esprima.parse(data, options);
+
+  let functions = immutable.List()
+  let functionTokens = immutable.List()
+  walk(fileAst, (node) => {
+    if (node.type == 'FunctionExpression') {
+      // const functionAstShallow = estools.map(node, (subNode) => {
+      //   if (subNode === undefined || subNode.type === undefined) return
+      //   if (subNode.type == 'FunctionExpression')
+      //     return {}
+      //   else return subNode
+      // })
+      //console.log(functionAstShallow)
+      //process.exit(1)
+      const functionAstShallow = node
+      const functionCode = regenerateFunctionCode(functionAstShallow)
+      functions = functions.push(functionCode)
+      const tokenizedFunction = tokenizer(functionCode, parentId, blockId++)
+      functionTokens = functionTokens.push(tokenizedFunction)
+    }
+  })
+
+  return functionTokens
+}
+
+
+const outputFile = function(functionTokens) {
+  functionTokens.forEach((f) => {
//console.log("===") + console.log(f) + //console.log("===") + }) +} + +// TODO: check input +const fileName = process.argv[2] + +fs.readFileAsync(fileName).then((data) => { + outputFile(processFile(process.argv[3], data)) +}); diff --git a/tokenizers/javascript/package.json b/tokenizers/javascript/package.json new file mode 100644 index 000000000..fff59c7e8 --- /dev/null +++ b/tokenizers/javascript/package.json @@ -0,0 +1,21 @@ +{ + "name": "jstokenizer", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1" + }, + "author": "Jakub Žitný (https://github.com/jakubzitny)", + "license": "ISC", + "dependencies": { + "escodegen": "^1.8.0", + "esprima": "^2.7.2", + "esprima-ast-utils": "0.0.6", + "esprima-walk": "^0.1.0", + "estools": "^2.1.0", + "fs-extra-promise": "^0.3.1", + "immutable": "^3.8.1", + "lodash": "^4.13.1" + } +} diff --git a/tokenizers/javascript/readme.md b/tokenizers/javascript/readme.md new file mode 100644 index 000000000..58ca4bce4 --- /dev/null +++ b/tokenizers/javascript/readme.md @@ -0,0 +1,5 @@ +# JavaScript tokenizer for SourcererCC + +- use `node 6.*` +- run as `node index.js /path/to/file.js` +- (carefully) use `batch.sh` to apply to larger dataset diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js new file mode 100644 index 000000000..c04cad4ce --- /dev/null +++ b/tokenizers/javascript/tokenizer.js @@ -0,0 +1,65 @@ +const _ = require('lodash') +const immutable = require('immutable') +const fs = require('fs-extra-promise') +const esprima = require('esprima') + +const MAIN_DELIMITER = '@#@' +const COUNT_DELIMITER = '@@::@@' +const TOKEN_DELIMITER = ',' + +const filterTokens = function (type, token) { + return token.type == type +} + + +// NOTE: http://esprima.org/doc/#usage +const tokenTypes = immutable.List.of( + 'Boolean', + 'Identifier', + 'Keyword', + 'Null', + 'Numeric', + 'Punctuator', + 'String', + 'RegularExpression' +) + +const tokenFilters = tokenTypes.map((tokenType) => { + return _.partial(filterTokens, tokenType) +}) + +const tokenizer = function(code, parentId, blockId) { + const options = { } + const tokens = immutable.List(esprima.tokenize(code, options)) + + // TODO: reduce to map + // const filteredTokens = tokenFilters.map((tokenFilter) => { + // return tokens.filter(tokenFilter) + // }) + + let uniqueTokens = immutable.Map() + tokens.forEach((token) => { + if (uniqueTokens.has(token.value)) { + newUniqueTokens = uniqueTokens.updateIn( + [ token.value ], + (count) => { + return count + 1 + }) + } else { + newUniqueTokens = uniqueTokens.set(token.value, 1) + } + uniqueTokens = newUniqueTokens + }) + + const tokenPairs = uniqueTokens.map((count, token) => { + return `${token}${COUNT_DELIMITER}${count}` + }) + + const lhs = `${parentId},${blockId},` + const rhs = tokenPairs.join(TOKEN_DELIMITER) + const output = `${lhs}${MAIN_DELIMITER}${rhs}` + + return output +}; + +module.exports = tokenizer diff --git a/tokenizers/javascript/util.js b/tokenizers/javascript/util.js new file mode 100644 index 000000000..178380f76 --- /dev/null +++ b/tokenizers/javascript/util.js @@ -0,0 +1,10 @@ + +const base64FileName = function(fileName) { + const fileNameBuffer = Buffer.from(fileName) + return fileNameBuffer.toString('base64') +} + + +module.exports = { + base64FileName +} From eb4990512306c64e8ce60d156a57de94bc36714b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?= Date: Sat, 4 Jun 2016 15:00:31 +0200 Subject: [PATCH 02/10] add 
---
 tokenizers/javascript/package.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tokenizers/javascript/package.json b/tokenizers/javascript/package.json
index fff59c7e8..20c489029 100644
--- a/tokenizers/javascript/package.json
+++ b/tokenizers/javascript/package.json
@@ -8,6 +8,10 @@
   },
   "author": "Jakub Žitný (https://github.com/jakubzitny)",
   "license": "ISC",
+  "repository": {
+    "url": "https://github.com/Mondego/SourcererCC.git",
+    "type": "git"
+  },
   "dependencies": {
     "escodegen": "^1.8.0",
     "esprima": "^2.7.2",

From 8ea911689e23a2c52170bfd64068b313be9fe158 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Fri, 17 Jun 2016 16:50:03 +0200
Subject: [PATCH 03/10] improve whitespace handling in js tokenizer

---
 tokenizers/javascript/tokenizer.js | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index c04cad4ce..bd0366e12 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -6,6 +6,8 @@ const esprima = require('esprima')
 const MAIN_DELIMITER = '@#@'
 const COUNT_DELIMITER = '@@::@@'
 const TOKEN_DELIMITER = ','
+const TOKEN_DELIMITER_REPLACEMENT = "_"
+const WHITESPACE = /(\s+)/
 
 const filterTokens = function (type, token) {
   return token.type == type
@@ -30,7 +32,22 @@ const tokenFilters = tokenTypes.map((tokenType) => {
   return _.partial(filterTokens, tokenType)
 })
 
 const tokenizer = function(code, parentId, blockId) {
   const options = { }
-  const tokens = immutable.List(esprima.tokenize(code, options))
+  // TODO: refactor this
+  const tokens = immutable.List(esprima.tokenize(code, options)).flatMap((token) => {
+    if (token.value.indexOf(TOKEN_DELIMITER) != -1)
+      token.value =
+        token.value.replace(TOKEN_DELIMITER, TOKEN_DELIMITER_REPLACEMENT)
+
+    if (token.type != 'String')
+      return immutable.List.of(token);
+
+    // NOTE: now it's a string
+    const stringTokensRaw = token.value.split(WHITESPACE)
+    const stringTokens = stringTokensRaw.map((stringToken) => {
+      return { value: stringToken }
+    })
+    return immutable.List(stringTokens)
+  })
 
   // TODO: reduce to map
   // const filteredTokens = tokenFilters.map((tokenFilter) => {

From 10fde12c1893654d5fe11cb3e5153c5214266a1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Fri, 17 Jun 2016 17:42:42 +0200
Subject: [PATCH 04/10] improve whitespace and hashbang line handling

---
 tokenizers/javascript/tokenizer.js | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index bd0366e12..2add5adf1 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -8,6 +8,7 @@ const COUNT_DELIMITER = '@@::@@'
 const TOKEN_DELIMITER = ','
 const TOKEN_DELIMITER_REPLACEMENT = "_"
 const WHITESPACE = /(\s+)/
+const HASHBANG_LINE = '#!/usr/bin/env node'
 
 const filterTokens = function (type, token) {
   return token.type == type
@@ -30,14 +31,29 @@ const tokenFilters = tokenTypes.map((tokenType) => {
   return _.partial(filterTokens, tokenType)
 })
 
+// TODO: handle "#!/usr/bin/env node"
+// TODO: handle 
 const tokenizer = function(code, parentId, blockId) {
   const options = { }
-  // TODO: refactor this
+  // TODO: refactor these
+  // NOTE: handle hashbang line
+  const firstLineOfCode = code.toString().substr(0, HASHBANG_LINE.length)
+  if (firstLineOfCode.indexOf(HASHBANG_LINE) != -1)
+    code = Buffer.from(code.toString().substr(HASHBANG_LINE.length))
+
   const tokens = immutable.List(esprima.tokenize(code, options)).flatMap((token) => {
     if (token.value.indexOf(TOKEN_DELIMITER) != -1)
       token.value =
         token.value.replace(TOKEN_DELIMITER, TOKEN_DELIMITER_REPLACEMENT)
 
+    // NOTE: get rid of all whitespace
+    if (token.value.indexOf(WHITESPACE) != -1)
+      token.value = token.value.replace(WHITESPACE, '')
+
+    // NOTE: skip RegExes, SCC has weird problems with them
+    if (token.type == 'RegularExpression')
+      return immutable.List()
+
     if (token.type != 'String')
       return immutable.List.of(token);

From 8ab4407af1aed4adc8c5d22d7f3b9ea8a56f0a4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Sun, 19 Jun 2016 22:27:40 +0200
Subject: [PATCH 05/10] improve readme for JS tokenizer

---
 tokenizers/javascript/readme.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tokenizers/javascript/readme.md b/tokenizers/javascript/readme.md
index 58ca4bce4..61e18622e 100644
--- a/tokenizers/javascript/readme.md
+++ b/tokenizers/javascript/readme.md
@@ -1,5 +1,6 @@
 # JavaScript tokenizer for SourcererCC
 
 - use `node 6.*`
+- run `npm install` from this directory to install dependencies
 - run as `node index.js /path/to/file.js`
 - (carefully) use `batch.sh` to apply to a larger dataset

From bd4272b326e805f8c73eb8a7aefc1672fa3889ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Mon, 20 Jun 2016 01:20:25 +0200
Subject: [PATCH 06/10] refactor hashbang and whitespace handling

---
 tokenizers/javascript/tokenizer.js | 46 ++++++++++++++++++------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index 2add5adf1..08bf1fb87 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -7,8 +7,7 @@ const MAIN_DELIMITER = '@#@'
 const COUNT_DELIMITER = '@@::@@'
 const TOKEN_DELIMITER = ','
 const TOKEN_DELIMITER_REPLACEMENT = "_"
-const WHITESPACE = /(\s+)/
-const HASHBANG_LINE = '#!/usr/bin/env node'
+const WHITESPACES = /(\s+)/g
 
 const filterTokens = function (type, token) {
   return token.type == type
@@ -21,48 +20,57 @@ const tokenTypes = immutable.List.of(
   'Identifier',
   'Keyword',
   'Null',
-  'Numeric',
   'Punctuator',
   'String',
   'RegularExpression'
 )
-
+// jakubkoo.. how do you make that grey thing with the asterisk, like two and three
+// like where exactly the interactions are in the organism
 const tokenFilters = tokenTypes.map((tokenType) => {
   return _.partial(filterTokens, tokenType)
 })
 
+// NOTE: Filter out hashbang lines
+const HASHBANG = /^#!/
+const filterHashbangLine = function(code) {
+  const firstLineLoc = code.indexOf('\n')
+  const firstLine = code.slice(0, firstLineLoc).toString()
+  if (firstLine.search(HASHBANG) == -1)
+    return code
+
+  return code.slice(firstLineLoc)
+}
+
 // TODO: handle "#!/usr/bin/env node"
 // TODO: handle 
 const tokenizer = function(code, parentId, blockId) {
   const options = { }
-  // TODO: refactor these
-  // NOTE: handle hashbang line
-  const firstLineOfCode = code.toString().substr(0, HASHBANG_LINE.length)
-  if (firstLineOfCode.indexOf(HASHBANG_LINE) != -1)
-    code = Buffer.from(code.toString().substr(HASHBANG_LINE.length))
+  const tokensRaw = esprima.tokenize(filterHashbangLine(code), options)
 
-  const tokens = immutable.List(esprima.tokenize(code, options)).flatMap((token) => {
+  // TODO: refactor these
+  const tokens = immutable.List(tokensRaw).flatMap((token) => {
     if (token.value.indexOf(TOKEN_DELIMITER) != -1)
       token.value =
         token.value.replace(TOKEN_DELIMITER, TOKEN_DELIMITER_REPLACEMENT)
 
     // NOTE: get rid of all whitespace
-    if (token.value.indexOf(WHITESPACE) != -1)
-      token.value = token.value.replace(WHITESPACE, '')
+    if (token.value.search(WHITESPACES) != -1)
+      token.value = token.value.replace(WHITESPACES, '')
 
     // NOTE: skip RegExes, SCC has weird problems with them
     if (token.type == 'RegularExpression')
       return immutable.List()
 
-    if (token.type != 'String')
-      return immutable.List.of(token);
+    //if (token.type != 'String')
+    return immutable.List.of(token);
 
     // NOTE: now it's a string
-    const stringTokensRaw = token.value.split(WHITESPACE)
-    const stringTokens = stringTokensRaw.map((stringToken) => {
-      return { value: stringToken }
-    })
-    return immutable.List(stringTokens)
+    // const stringTokensRaw = token.value.split(WHITESPACE)
+    // const stringTokens = stringTokensRaw.map((stringToken) => {
+    //   return { value: stringToken }
+    // })
+    // return immutable.List(stringTokens)
   })
 
   // TODO: reduce to map

From d41550e99dfd538e641d6f8a03e3e0dacbb03f2a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Mon, 20 Jun 2016 01:49:04 +0200
Subject: [PATCH 07/10] skip empty function tokens

---
 tokenizers/javascript/index.js     | 4 +++-
 tokenizers/javascript/tokenizer.js | 3 +++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/tokenizers/javascript/index.js b/tokenizers/javascript/index.js
index 1cb3601e4..7b20f7364 100644
--- a/tokenizers/javascript/index.js
+++ b/tokenizers/javascript/index.js
@@ -63,8 +63,10 @@ const processFile = function(fileName, data) {
       const functionAstShallow = node
       const functionCode = regenerateFunctionCode(functionAstShallow)
       functions = functions.push(functionCode)
+
       const tokenizedFunction = tokenizer(functionCode, parentId, blockId++)
-      functionTokens = functionTokens.push(tokenizedFunction)
+      if (tokenizedFunction)
+        functionTokens = functionTokens.push(tokenizedFunction)
     }
   })
diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index 08bf1fb87..d8716809f 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -96,6 +96,9 @@ const tokenizer = function(code, parentId, blockId) {
   const tokenPairs = uniqueTokens.map((count, token) => {
     return `${token}${COUNT_DELIMITER}${count}`
   })
 
+  if (tokenPairs.size == 0)
+    return ''
+
   const lhs = `${parentId},${blockId},`
   const rhs = tokenPairs.join(TOKEN_DELIMITER)
   const output = `${lhs}${MAIN_DELIMITER}${rhs}`

From 962360b54b9dd0b80d86dbd3798671c3e2cc58d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Mon, 20 Jun 2016 01:59:34 +0200
Subject: [PATCH 08/10] fix token delimiter replacement for all occurrences

---
 tokenizers/javascript/tokenizer.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index d8716809f..93b446c40 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -51,8 +51,9 @@ const tokenizer = function(code, parentId, blockId) {
   // TODO: refactor these
   const tokens = immutable.List(tokensRaw).flatMap((token) => {
     if (token.value.indexOf(TOKEN_DELIMITER) != -1)
+      tokenDelimiters = new RegExp(TOKEN_DELIMITER, 'g')
       token.value =
-        token.value.replace(TOKEN_DELIMITER, TOKEN_DELIMITER_REPLACEMENT)
+        token.value.replace(tokenDelimiters, TOKEN_DELIMITER_REPLACEMENT)

From 49cd0c299445f7322e46c0a75c129e35c1199602 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Mon, 20 Jun 2016 01:59:34 +0200
Subject: [PATCH 09/10] fix token delimiter replacement for all occurrences

---
 tokenizers/javascript/tokenizer.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index d8716809f..b543b9d2b 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -51,8 +51,9 @@ const tokenizer = function(code, parentId, blockId) {
   // TODO: refactor these
   const tokens = immutable.List(tokensRaw).flatMap((token) => {
     if (token.value.indexOf(TOKEN_DELIMITER) != -1)
+      const tokenDelimiters = new RegExp(TOKEN_DELIMITER, 'g')
       token.value =
-        token.value.replace(TOKEN_DELIMITER, TOKEN_DELIMITER_REPLACEMENT)
+        token.value.replace(tokenDelimiters, TOKEN_DELIMITER_REPLACEMENT)

From fce1f149b1e99f690bd4d7438476d7d6ec58871f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BDitn=C3=BD?=
Date: Tue, 30 Aug 2016 11:18:58 +0200
Subject: [PATCH 10/10] fix wrongly rebased const fix in tokenizer.js

---
 tokenizers/javascript/tokenizer.js | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tokenizers/javascript/tokenizer.js b/tokenizers/javascript/tokenizer.js
index 773c2173b..b543b9d2b 100644
--- a/tokenizers/javascript/tokenizer.js
+++ b/tokenizers/javascript/tokenizer.js
@@ -51,11 +51,7 @@ const tokenizer = function(code, parentId, blockId) {
   // TODO: refactor these
   const tokens = immutable.List(tokensRaw).flatMap((token) => {
     if (token.value.indexOf(TOKEN_DELIMITER) != -1)
-<<<<<<< HEAD
-      tokenDelimiters = new RegExp(TOKEN_DELIMITER, 'g')
-=======
       const tokenDelimiters = new RegExp(TOKEN_DELIMITER, 'g')
->>>>>>> 49cd0c299445f7322e46c0a75c129e35c1199602
       token.value =
         token.value.replace(tokenDelimiters, TOKEN_DELIMITER_REPLACEMENT)
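
---
For reference, a minimal sketch of the line format tokenizer.js emits for
SourcererCC, using the delimiters defined above: '@#@' separates the block
header from its token list, '@@::@@' separates each token from its count, and
',' separates the pairs. The snippet and parentId below are hypothetical, and
the order of the token/count pairs may vary with the underlying immutable map:

    // tokenize a hypothetical one-line snippet at file scope
    const tokenizer = require('./tokenizer')

    // hypothetical parentId 'sample.js', blockId 1
    console.log(tokenizer('a = a + 1', 'sample.js', 1))
    // expected shape:
    // sample.js,1,@#@a@@::@@2,=@@::@@1,+@@::@@1,1@@::@@1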