diff --git a/cli/src/cli/commands/run.ts b/cli/src/cli/commands/run.ts index fbe5adf61..60038bf64 100644 --- a/cli/src/cli/commands/run.ts +++ b/cli/src/cli/commands/run.ts @@ -166,6 +166,12 @@ export function runCommand(program: Command): Command { x => parseFloat(x), Options.defaultKgramsInWindow ) + .option( + "-C, --include-comments", + Utils.indent( + "Include the comments during the tokenization process." + ) + ) .action(async (locations, options) => run(locations, { ...options , ...program.opts() })); } @@ -206,6 +212,7 @@ export async function run(locations: string[], options: RunOptions): Promise; + public abstract createTokenizer(options?: TokenizerOptions): Promise; } export class ProgrammingLanguage extends Language { @@ -54,10 +54,10 @@ export class ProgrammingLanguage extends Language { return this.languageModule; } - async createTokenizer(): Promise { + async createTokenizer(options?: TokenizerOptions): Promise { const { CodeTokenizer } = await import ("./tokenizer/codeTokenizer.js"); await this.loadLanguageModule(); - return new CodeTokenizer(this); + return new CodeTokenizer(this, options); } } @@ -86,13 +86,13 @@ export class CustomTokenizerLanguage extends Language { constructor( readonly name: string, readonly extensions: string[], - readonly customTokenizer: ((self: Language) => Promise) + readonly customTokenizer: ((self: Language, options?: TokenizerOptions) => Promise) ) { super(name, extensions); } - public async createTokenizer(): Promise { - return await this.customTokenizer(this); + public async createTokenizer(options?: TokenizerOptions): Promise { + return await this.customTokenizer(this, options); } } @@ -127,9 +127,9 @@ export class LanguagePicker { new ProgrammingLanguage("typescript", [".ts"]), new ProgrammingLanguage("tsx", [".tsx"]), new ProgrammingLanguage("verilog", [".v", ".vh"]), - new CustomTokenizerLanguage("char", [".txt", ".md"], async self => { + new CustomTokenizerLanguage("char", [".txt", ".md"], async (self, options) => { const { CharTokenizer } = await import("./tokenizer/charTokenizer.js"); - return new CharTokenizer(self); + return new CharTokenizer(self, options); }), ]; diff --git a/lib/src/lib/options.ts b/lib/src/lib/options.ts index 04e7e5129..c53dac250 100644 --- a/lib/src/lib/options.ts +++ b/lib/src/lib/options.ts @@ -11,6 +11,7 @@ export interface DolosOptions { sortBy: string | null; fragmentSortBy: string | null; kgramData: boolean; + includeComments: boolean; } export type CustomOptions = Partial; @@ -87,6 +88,9 @@ export class Options implements DolosOptions { return this.custom.kgramData == true; } + get includeComments(): boolean { + return this.custom.includeComments === true; + } get limitResults(): number | null { return definedOrNull(this.custom.limitResults); @@ -155,6 +159,7 @@ export class Options implements DolosOptions { sortBy: this.sortBy, fragmentSortBy: this.fragmentSortBy, kgramData: this.kgramData, + includeComments: this.includeComments, }; } diff --git a/lib/src/lib/tokenizer/codeTokenizer.ts b/lib/src/lib/tokenizer/codeTokenizer.ts index 514fe6333..e26610fb7 100644 --- a/lib/src/lib/tokenizer/codeTokenizer.ts +++ b/lib/src/lib/tokenizer/codeTokenizer.ts @@ -1,21 +1,21 @@ import { default as Parser, SyntaxNode } from "tree-sitter"; import { Region } from "@dodona/dolos-core"; -import { Token, Tokenizer } from "./tokenizer.js"; +import { Token, Tokenizer, TokenizerOptions } from "./tokenizer.js"; import { ProgrammingLanguage } from "../language.js"; export class CodeTokenizer extends Tokenizer { private readonly parser: Parser; - /** * Creates a new tokenizer of the given language. Will throw an error when the * given language is not supported. See Tokenizer.supportedLanguages for a * list of all supported languages. * * @param language The language to use for this tokenizer. + * @param options */ - constructor(language: ProgrammingLanguage) { - super(language); + constructor(language: ProgrammingLanguage, options?: TokenizerOptions) { + super(language, options); this.parser = new Parser(); this.parser.setLanguage(language.getLanguageModule()); } @@ -62,8 +62,11 @@ export class CodeTokenizer extends Tokenizer { node.endPosition.column ); - tokens.push(this.newToken("(", location)); - tokens.push(this.newToken(node.type, location)); + const includeToken = !node.type.includes("comment") || this.options.includeComments; + if (includeToken) { + tokens.push(this.newToken("(", location)); + tokens.push(this.newToken(node.type, location)); + } for (const child of node.namedChildren) { const [childStartRow, childStartCol] = this.tokenizeNode(child, tokens); @@ -75,7 +78,9 @@ export class CodeTokenizer extends Tokenizer { } } - tokens.push(this.newToken(")", location)); + if (includeToken) { + tokens.push(this.newToken(")", location)); + } // Also return the startRow and startCol, this can be used by the parent node. return [location.startRow, location.startCol]; diff --git a/lib/src/lib/tokenizer/tokenizer.ts b/lib/src/lib/tokenizer/tokenizer.ts index 24de6da1c..c0e7677bb 100644 --- a/lib/src/lib/tokenizer/tokenizer.ts +++ b/lib/src/lib/tokenizer/tokenizer.ts @@ -6,10 +6,13 @@ export interface Token { location: Region; } -export abstract class Tokenizer { +export type TokenizerOptions = Partial<{ + includeComments: boolean; +}> - constructor(public readonly language: Language) {} +export abstract class Tokenizer { + constructor(public readonly language: Language, protected readonly options: TokenizerOptions = {}) {} /** * Runs the parser on a given string. Returns a list of Tokens diff --git a/lib/src/test/snapshots/tokenizer.test.ts.md b/lib/src/test/snapshots/tokenizer.test.ts.md index 596461c56..f4b73991d 100644 --- a/lib/src/test/snapshots/tokenizer.test.ts.md +++ b/lib/src/test/snapshots/tokenizer.test.ts.md @@ -12,9 +12,6 @@ Generated by [AVA](https://avajs.dev). '(', 'program', '(', - 'comment', - ')', - '(', 'function_definition', '(', 'word', @@ -1081,9 +1078,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'list', '(', 'command', @@ -1793,9 +1787,6 @@ Generated by [AVA](https://avajs.dev). '(', 'compilation_unit', '(', - 'comment', - ')', - '(', 'using_directive', '(', 'identifier', @@ -6393,9 +6384,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'expression_statement', '(', 'assignment_expression', @@ -6771,9 +6759,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'expression_statement', '(', 'call_expression', @@ -9009,9 +8994,6 @@ Generated by [AVA](https://avajs.dev). '(', 'program', '(', - 'line_comment', - ')', - '(', 'class_declaration', '(', 'modifiers', @@ -10406,9 +10388,6 @@ Generated by [AVA](https://avajs.dev). '(', 'module', '(', - 'comment', - ')', - '(', 'function_definition', '(', 'identifier', @@ -11169,9 +11148,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'named_element', '(', 'class_definition', @@ -13352,9 +13328,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'named_element', '(', 'component_clause', @@ -13382,9 +13355,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'named_element', '(', 'component_clause', @@ -18716,9 +18686,6 @@ Generated by [AVA](https://avajs.dev). '(', 'braced_expression', '(', - 'comment', - ')', - '(', 'if_statement', '(', 'binary_operator', @@ -18959,9 +18926,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'call', '(', 'identifier', @@ -19040,9 +19004,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'binary_operator', '(', 'identifier', @@ -20827,9 +20788,6 @@ Generated by [AVA](https://avajs.dev). '(', 'ERROR', '(', - 'comment', - ')', - '(', 'keyword_drop', ')', '(', @@ -20994,9 +20952,6 @@ Generated by [AVA](https://avajs.dev). '(', 'program', '(', - 'comment', - ')', - '(', 'interface_declaration', '(', 'type_identifier', @@ -22906,9 +22861,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'variable_declaration', '(', 'variable_declarator', @@ -23045,9 +22997,6 @@ Generated by [AVA](https://avajs.dev). '(', 'source_file', '(', - 'comment', - ')', - '(', 'package_declaration', '(', 'package_identifier', @@ -23581,9 +23530,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'id_directive', '(', 'text_macro_identifier', @@ -23909,9 +23855,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'line_compiler_directive', '(', 'unsigned_number', @@ -24492,9 +24435,6 @@ Generated by [AVA](https://avajs.dev). 'ERROR', ')', '(', - 'comment', - ')', - '(', 'ordered_port_connection', '(', 'expression', @@ -24564,9 +24504,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'ordered_port_connection', '(', 'expression', @@ -24740,9 +24677,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'parameter_port_declaration', '(', 'parameter_declaration', @@ -24811,9 +24745,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', ')', - '(', - 'comment', - ')', ')', '(', 'list_of_ports', @@ -25069,9 +25000,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'statement_or_null', '(', 'statement', @@ -25116,9 +25044,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', ')', - '(', - 'comment', - ')', ')', ')', ')', @@ -25273,9 +25198,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'module_or_generate_item', '(', 'continuous_assign', @@ -25324,9 +25246,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'module_or_generate_item', '(', 'continuous_assign', @@ -25372,9 +25291,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'module_or_generate_item', '(', 'continuous_assign', diff --git a/lib/src/test/snapshots/tokenizer.test.ts.snap b/lib/src/test/snapshots/tokenizer.test.ts.snap index 28bed1aeb..6988619d8 100644 Binary files a/lib/src/test/snapshots/tokenizer.test.ts.snap and b/lib/src/test/snapshots/tokenizer.test.ts.snap differ diff --git a/lib/src/test/tokenizer.test.ts b/lib/src/test/tokenizer.test.ts index 017b10435..f3609d079 100644 --- a/lib/src/test/tokenizer.test.ts +++ b/lib/src/test/tokenizer.test.ts @@ -26,29 +26,6 @@ const languageFiles = { "verilog": "../samples/verilog/module.v" } as {[key: string]: string}; -const tokenLength = { - "../samples/bash/caesar.sh": 1185, - "../samples/c/caesar.c": 582, - "../samples/c-sharp/Caesar.cs": 606, - "../samples/char/caesar.txt": 3700, - "../samples/cpp/caesar.cpp": 801, - "../samples/elm/Caesar.elm": 753, - "../samples/go/Caesar.go": 1032, - "../samples/groovy/caesar.groovy": 282, - "../samples/java/Caesar.java": 522, - "../samples/javascript/sample.js": 861, - "../samples/python/caesar.py": 309, - "../samples/php/caesar.php": 411, - "../samples/modelica/sample.mo": 7542, - "../samples/r/caesar.R": 594, - "../samples/rust/caesar.rs": 774, - "../samples/scala/Caesar.scala": 366, - "../samples/sql/sample.sql": 543, - "../samples/tsx/sample.tsx": 1659, - "../samples/typescript/caesar.ts": 378, - "../samples/verilog/module.v": 2484 -} as {[key: string]: number}; - for (const [languageName, languageFile] of Object.entries(languageFiles)) { test(`LanguagePicker can find ${languageName} correctly by name`, async t => { const language = await new LanguagePicker().findLanguage(languageName); @@ -74,7 +51,6 @@ for (const [languageName, languageFile] of Object.entries(languageFiles)) { const { tokens } = tokenizer.tokenizeFile(file); t.truthy(tokens); t.snapshot(tokens, "stable tokenization"); - t.is(tokens.length, tokenLength[languageFile]); }); } @@ -201,3 +177,12 @@ test("should be able to correctly tokenize a loop", async t => { ); }); + +test("tokens should contain comments when includeComments is true", async t => { + const file = new File("comments.js", "let i = 0;\nwhile (i < 10) { // comment\n i += 1;\n}"); + const language = await (new LanguagePicker().findLanguage("javascript")); + + const tokenizer = await language.createTokenizer({ includeComments: true }); + const { tokens } = tokenizer.tokenizeFile(file); + t.true(tokens.includes("comment")); +}); \ No newline at end of file