diff --git a/cli/src/cli/commands/run.ts b/cli/src/cli/commands/run.ts index a3daca1ed..53c6314a2 100644 --- a/cli/src/cli/commands/run.ts +++ b/cli/src/cli/commands/run.ts @@ -167,9 +167,9 @@ export function runCommand(program: Command): Command { Options.defaultKgramsInWindow ) .option( - "-e, --exclude-comments", + "-ic, --include-comments", Utils.indent( - "Exclude the comments during the tokenization process." + "Include the comments during the tokenization process." ) ) .action(async (locations, options) => run(locations, { ...options , ...program.opts() })); @@ -212,7 +212,7 @@ export async function run(locations: string[], options: RunOptions): Promise; + public abstract createTokenizer(options?: TokenizerOptions): Promise; } export class ProgrammingLanguage extends Language { @@ -54,7 +54,7 @@ export class ProgrammingLanguage extends Language { return this.languageModule; } - async createTokenizer(options: TokenizerOptions): Promise { + async createTokenizer(options?: TokenizerOptions): Promise { const { CodeTokenizer } = await import ("./tokenizer/codeTokenizer.js"); await this.loadLanguageModule(); return new CodeTokenizer(this, options); @@ -86,12 +86,12 @@ export class CustomTokenizerLanguage extends Language { constructor( readonly name: string, readonly extensions: string[], - readonly customTokenizer: ((self: Language, options: TokenizerOptions) => Promise) + readonly customTokenizer: ((self: Language, options?: TokenizerOptions) => Promise) ) { super(name, extensions); } - public async createTokenizer(options: TokenizerOptions): Promise { + public async createTokenizer(options?: TokenizerOptions): Promise { return await this.customTokenizer(this, options); } } diff --git a/lib/src/lib/options.ts b/lib/src/lib/options.ts index fdfa26cbb..c53dac250 100644 --- a/lib/src/lib/options.ts +++ b/lib/src/lib/options.ts @@ -11,7 +11,7 @@ export interface DolosOptions { sortBy: string | null; fragmentSortBy: string | null; kgramData: boolean; - excludeComments: boolean; + includeComments: boolean; } export type CustomOptions = Partial; @@ -88,8 +88,8 @@ export class Options implements DolosOptions { return this.custom.kgramData == true; } - get excludeComments(): boolean { - return this.custom.excludeComments === true; + get includeComments(): boolean { + return this.custom.includeComments === true; } get limitResults(): number | null { @@ -159,7 +159,7 @@ export class Options implements DolosOptions { sortBy: this.sortBy, fragmentSortBy: this.fragmentSortBy, kgramData: this.kgramData, - excludeComments: this.excludeComments, + includeComments: this.includeComments, }; } diff --git a/lib/src/lib/tokenizer/codeTokenizer.ts b/lib/src/lib/tokenizer/codeTokenizer.ts index 79ebde3a8..d919969f8 100644 --- a/lib/src/lib/tokenizer/codeTokenizer.ts +++ b/lib/src/lib/tokenizer/codeTokenizer.ts @@ -14,7 +14,7 @@ export class CodeTokenizer extends Tokenizer { * @param language The language to use for this tokenizer. * @param options */ - constructor(language: ProgrammingLanguage, options: TokenizerOptions) { + constructor(language: ProgrammingLanguage, options?: TokenizerOptions) { super(language, options); this.parser = new Parser(); this.parser.setLanguage(language.getLanguageModule()); @@ -63,7 +63,7 @@ export class CodeTokenizer extends Tokenizer { ); const isComment = node.type.includes("comment"); - if (!this.options.excludeComments || !isComment) { + if (!isComment || this.options.includeComments) { tokens.push(this.newToken("(", location)); tokens.push(this.newToken(node.type, location)); } @@ -78,7 +78,7 @@ export class CodeTokenizer extends Tokenizer { } } - if (!this.options.excludeComments || !isComment) { + if (!isComment || this.options.includeComments) { tokens.push(this.newToken(")", location)); } diff --git a/lib/src/lib/tokenizer/tokenizer.ts b/lib/src/lib/tokenizer/tokenizer.ts index c0e1f7c58..c66967e74 100644 --- a/lib/src/lib/tokenizer/tokenizer.ts +++ b/lib/src/lib/tokenizer/tokenizer.ts @@ -6,16 +6,18 @@ export interface Token { location: Region; } -export type TokenizerOptions = { - excludeComments: boolean; -} +export type TokenizerOptions = Partial<{ + includeComments: boolean; +}> export abstract class Tokenizer { - protected options: TokenizerOptions; + protected options: TokenizerOptions = {}; - constructor(public readonly language: Language, options: TokenizerOptions) { - this.options = options; + constructor(public readonly language: Language, options?: TokenizerOptions) { + if (options !== undefined) { + this.options = options; + } } diff --git a/lib/src/test/snapshots/tokenizer.test.ts.md b/lib/src/test/snapshots/tokenizer.test.ts.md index 596461c56..f4b73991d 100644 --- a/lib/src/test/snapshots/tokenizer.test.ts.md +++ b/lib/src/test/snapshots/tokenizer.test.ts.md @@ -12,9 +12,6 @@ Generated by [AVA](https://avajs.dev). '(', 'program', '(', - 'comment', - ')', - '(', 'function_definition', '(', 'word', @@ -1081,9 +1078,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'list', '(', 'command', @@ -1793,9 +1787,6 @@ Generated by [AVA](https://avajs.dev). '(', 'compilation_unit', '(', - 'comment', - ')', - '(', 'using_directive', '(', 'identifier', @@ -6393,9 +6384,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'expression_statement', '(', 'assignment_expression', @@ -6771,9 +6759,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'expression_statement', '(', 'call_expression', @@ -9009,9 +8994,6 @@ Generated by [AVA](https://avajs.dev). '(', 'program', '(', - 'line_comment', - ')', - '(', 'class_declaration', '(', 'modifiers', @@ -10406,9 +10388,6 @@ Generated by [AVA](https://avajs.dev). '(', 'module', '(', - 'comment', - ')', - '(', 'function_definition', '(', 'identifier', @@ -11169,9 +11148,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'named_element', '(', 'class_definition', @@ -13352,9 +13328,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'named_element', '(', 'component_clause', @@ -13382,9 +13355,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'named_element', '(', 'component_clause', @@ -18716,9 +18686,6 @@ Generated by [AVA](https://avajs.dev). '(', 'braced_expression', '(', - 'comment', - ')', - '(', 'if_statement', '(', 'binary_operator', @@ -18959,9 +18926,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'call', '(', 'identifier', @@ -19040,9 +19004,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'binary_operator', '(', 'identifier', @@ -20827,9 +20788,6 @@ Generated by [AVA](https://avajs.dev). '(', 'ERROR', '(', - 'comment', - ')', - '(', 'keyword_drop', ')', '(', @@ -20994,9 +20952,6 @@ Generated by [AVA](https://avajs.dev). '(', 'program', '(', - 'comment', - ')', - '(', 'interface_declaration', '(', 'type_identifier', @@ -22906,9 +22861,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'variable_declaration', '(', 'variable_declarator', @@ -23045,9 +22997,6 @@ Generated by [AVA](https://avajs.dev). '(', 'source_file', '(', - 'comment', - ')', - '(', 'package_declaration', '(', 'package_identifier', @@ -23581,9 +23530,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'id_directive', '(', 'text_macro_identifier', @@ -23909,9 +23855,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'line_compiler_directive', '(', 'unsigned_number', @@ -24492,9 +24435,6 @@ Generated by [AVA](https://avajs.dev). 'ERROR', ')', '(', - 'comment', - ')', - '(', 'ordered_port_connection', '(', 'expression', @@ -24564,9 +24504,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'ordered_port_connection', '(', 'expression', @@ -24740,9 +24677,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'parameter_port_declaration', '(', 'parameter_declaration', @@ -24811,9 +24745,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', ')', - '(', - 'comment', - ')', ')', '(', 'list_of_ports', @@ -25069,9 +25000,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'statement_or_null', '(', 'statement', @@ -25116,9 +25044,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', ')', - '(', - 'comment', - ')', ')', ')', ')', @@ -25273,9 +25198,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'module_or_generate_item', '(', 'continuous_assign', @@ -25324,9 +25246,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'module_or_generate_item', '(', 'continuous_assign', @@ -25372,9 +25291,6 @@ Generated by [AVA](https://avajs.dev). ')', ')', '(', - 'comment', - ')', - '(', 'module_or_generate_item', '(', 'continuous_assign', diff --git a/lib/src/test/snapshots/tokenizer.test.ts.snap b/lib/src/test/snapshots/tokenizer.test.ts.snap index 28bed1aeb..6988619d8 100644 Binary files a/lib/src/test/snapshots/tokenizer.test.ts.snap and b/lib/src/test/snapshots/tokenizer.test.ts.snap differ diff --git a/lib/src/test/tokenizer.test.ts b/lib/src/test/tokenizer.test.ts index 781ebf918..b4779b38b 100644 --- a/lib/src/test/tokenizer.test.ts +++ b/lib/src/test/tokenizer.test.ts @@ -27,26 +27,26 @@ const languageFiles = { } as {[key: string]: string}; const tokenLength = { - "../samples/bash/caesar.sh": 1185, + "../samples/bash/caesar.sh": 1179, "../samples/c/caesar.c": 582, - "../samples/c-sharp/Caesar.cs": 606, + "../samples/c-sharp/Caesar.cs": 603, "../samples/char/caesar.txt": 3700, - "../samples/cpp/caesar.cpp": 801, + "../samples/cpp/caesar.cpp": 795, "../samples/elm/Caesar.elm": 753, "../samples/go/Caesar.go": 1032, "../samples/groovy/caesar.groovy": 282, - "../samples/java/Caesar.java": 522, + "../samples/java/Caesar.java": 519, "../samples/javascript/sample.js": 861, - "../samples/python/caesar.py": 309, + "../samples/python/caesar.py": 306, "../samples/php/caesar.php": 411, - "../samples/modelica/sample.mo": 7542, - "../samples/r/caesar.R": 594, + "../samples/modelica/sample.mo": 7533, + "../samples/r/caesar.R": 585, "../samples/rust/caesar.rs": 774, "../samples/scala/Caesar.scala": 366, - "../samples/sql/sample.sql": 543, - "../samples/tsx/sample.tsx": 1659, - "../samples/typescript/caesar.ts": 378, - "../samples/verilog/module.v": 2484 + "../samples/sql/sample.sql": 540, + "../samples/tsx/sample.tsx": 1656, + "../samples/typescript/caesar.ts": 375, + "../samples/verilog/module.v": 2448 } as {[key: string]: number}; for (const [languageName, languageFile] of Object.entries(languageFiles)) { @@ -68,7 +68,7 @@ for (const [languageName, languageFile] of Object.entries(languageFiles)) { const file = (await readPath(languageFile)).ok(); const language = new LanguagePicker().detectLanguage([file]); - const tokenizer = await language.createTokenizer({ excludeComments: false }); + const tokenizer = await language.createTokenizer(); t.truthy(tokenizer); const { tokens } = tokenizer.tokenizeFile(file); @@ -96,7 +96,7 @@ test("should be able to use external tree-sitter parsers (tree-sitter-json)", as const file = (await readPath("./package.json")).ok(); const language = await (new LanguagePicker().findLanguage("json")); - const tokenizer = await language.createTokenizer({ excludeComments: false }); + const tokenizer = await language.createTokenizer(); t.truthy(tokenizer); const { tokens } = tokenizer.tokenizeFile(file); @@ -107,7 +107,7 @@ test("should be able to parse larger files", async t => { const file = new File("long.js", "var test = 1;\n".repeat(10000)); const language = await (new LanguagePicker().findLanguage("javascript")); - const tokenizer = await language.createTokenizer({ excludeComments: false }); + const tokenizer = await language.createTokenizer(); t.truthy(tokenizer); const { tokens } = tokenizer.tokenizeFile(file); @@ -117,7 +117,7 @@ test("should be able to parse larger files", async t => { test("should be able to correctly tokenize a variable", async t => { const file = new File("long.js", "var test = 1;"); const language = await (new LanguagePicker().findLanguage("javascript")); - const tokenizer = await language.createTokenizer({ excludeComments: false }); + const tokenizer = await language.createTokenizer(); const { tokens, mapping } = tokenizer.tokenizeFile(file); t.is(tokens.join(""), "(program(variable_declaration(variable_declarator(identifier)(number))))"); @@ -145,7 +145,7 @@ test("should be able to correctly tokenize a loop", async t => { const file = new File("long.js", "let i = 0;\nwhile (i < 10) {\n i += 1;\n}"); const language = await (new LanguagePicker().findLanguage("javascript")); - const tokenizer = await language.createTokenizer({ excludeComments: false }); + const tokenizer = await language.createTokenizer(); const { tokens, mapping } = tokenizer.tokenizeFile(file); t.is(tokens.join(""), "(program(lexical_declaration(variable_declarator(identifier)(number)))" + "(while_statement(parenthesized_expression(binary_expression(identifier)(number)))" +