Skip to content

Commit

Permalink
Merge pull request #1645 from dodona-edu/exclude-comments
Browse files Browse the repository at this point in the history
Exclude the comments in the tokenization step
  • Loading branch information
rien authored Nov 20, 2024
2 parents 1d13eb5 + 891d255 commit a8ac373
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 127 deletions.
7 changes: 7 additions & 0 deletions cli/src/cli/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,12 @@ export function runCommand(program: Command): Command {
x => parseFloat(x),
Options.defaultKgramsInWindow
)
.option(
"-C, --include-comments",
Utils.indent(
"Include the comments during the tokenization process."
)
)
.action(async (locations, options) => run(locations, { ...options , ...program.opts() }));
}

Expand Down Expand Up @@ -206,6 +212,7 @@ export async function run(locations: string[], options: RunOptions): Promise<voi
limitResults: options.limitResults,
sortBy: options.sortBy,
fragmentSortBy: options.fragmentSortBy,
includeComments: options.includeComments
});
const report = await dolos.analyzePaths(locations, options.ignore);

Expand Down
2 changes: 1 addition & 1 deletion lib/src/lib/dolos.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export class Dolos {
this.language = this.languagePicker.detectLanguage(files);
this.languageDetected = true;
}
this.tokenizer = await this.language.createTokenizer();
this.tokenizer = await this.language.createTokenizer({ includeComments: this.options.includeComments });
this.index = new FingerprintIndex(this.options.kgramLength, this.options.kgramsInWindow, this.options.kgramData);
}
const warnings = [];
Expand Down
18 changes: 9 additions & 9 deletions lib/src/lib/language.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/* eslint-disable @typescript-eslint/ban-ts-comment */
import { Tokenizer } from "./tokenizer/tokenizer.js";
import { Tokenizer, TokenizerOptions } from "./tokenizer/tokenizer.js";
import { File } from "@dodona/dolos-core";

// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand All @@ -25,7 +25,7 @@ export abstract class Language {
}
}

public abstract createTokenizer(): Promise<Tokenizer>;
public abstract createTokenizer(options?: TokenizerOptions): Promise<Tokenizer>;
}

export class ProgrammingLanguage extends Language {
Expand Down Expand Up @@ -54,10 +54,10 @@ export class ProgrammingLanguage extends Language {
return this.languageModule;
}

async createTokenizer(): Promise<Tokenizer> {
async createTokenizer(options?: TokenizerOptions): Promise<Tokenizer> {
const { CodeTokenizer } = await import ("./tokenizer/codeTokenizer.js");
await this.loadLanguageModule();
return new CodeTokenizer(this);
return new CodeTokenizer(this, options);
}
}

Expand Down Expand Up @@ -86,13 +86,13 @@ export class CustomTokenizerLanguage extends Language {
constructor(
readonly name: string,
readonly extensions: string[],
readonly customTokenizer: ((self: Language) => Promise<Tokenizer>)
readonly customTokenizer: ((self: Language, options?: TokenizerOptions) => Promise<Tokenizer>)
) {
super(name, extensions);
}

public async createTokenizer(): Promise<Tokenizer> {
return await this.customTokenizer(this);
public async createTokenizer(options?: TokenizerOptions): Promise<Tokenizer> {
return await this.customTokenizer(this, options);
}
}

Expand Down Expand Up @@ -127,9 +127,9 @@ export class LanguagePicker {
new ProgrammingLanguage("typescript", [".ts"]),
new ProgrammingLanguage("tsx", [".tsx"]),
new ProgrammingLanguage("verilog", [".v", ".vh"]),
new CustomTokenizerLanguage("char", [".txt", ".md"], async self => {
new CustomTokenizerLanguage("char", [".txt", ".md"], async (self, options) => {
const { CharTokenizer } = await import("./tokenizer/charTokenizer.js");
return new CharTokenizer(self);
return new CharTokenizer(self, options);
}),
];

Expand Down
5 changes: 5 additions & 0 deletions lib/src/lib/options.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export interface DolosOptions {
sortBy: string | null;
fragmentSortBy: string | null;
kgramData: boolean;
includeComments: boolean;
}

export type CustomOptions = Partial<DolosOptions>;
Expand Down Expand Up @@ -87,6 +88,9 @@ export class Options implements DolosOptions {
return this.custom.kgramData == true;
}

/**
 * Whether comments should be included during tokenization.
 *
 * Only an explicit `true` in the custom options enables this; a missing
 * value (or anything else) yields `false`.
 */
get includeComments(): boolean {
  const { includeComments } = this.custom;
  return includeComments === true;
}

get limitResults(): number | null {
return definedOrNull(this.custom.limitResults);
Expand Down Expand Up @@ -155,6 +159,7 @@ export class Options implements DolosOptions {
sortBy: this.sortBy,
fragmentSortBy: this.fragmentSortBy,
kgramData: this.kgramData,
includeComments: this.includeComments,
};
}

Expand Down
19 changes: 12 additions & 7 deletions lib/src/lib/tokenizer/codeTokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
import { default as Parser, SyntaxNode } from "tree-sitter";
import { Region } from "@dodona/dolos-core";
import { Token, Tokenizer } from "./tokenizer.js";
import { Token, Tokenizer, TokenizerOptions } from "./tokenizer.js";
import { ProgrammingLanguage } from "../language.js";

export class CodeTokenizer extends Tokenizer {

private readonly parser: Parser;

/**
* Creates a new tokenizer of the given language. Will throw an error when the
* given language is not supported. See Tokenizer.supportedLanguages for a
* list of all supported languages.
*
* @param language The language to use for this tokenizer.
* @param options Optional tokenizer settings (e.g. `includeComments`), passed on to the base Tokenizer.
*/
constructor(language: ProgrammingLanguage) {
super(language);
constructor(language: ProgrammingLanguage, options?: TokenizerOptions) {
super(language, options);
this.parser = new Parser();
this.parser.setLanguage(language.getLanguageModule());
}
Expand Down Expand Up @@ -62,8 +62,11 @@ export class CodeTokenizer extends Tokenizer {
node.endPosition.column
);

tokens.push(this.newToken("(", location));
tokens.push(this.newToken(node.type, location));
const includeToken = !node.type.includes("comment") || this.options.includeComments;
if (includeToken) {
tokens.push(this.newToken("(", location));
tokens.push(this.newToken(node.type, location));
}
for (const child of node.namedChildren) {

const [childStartRow, childStartCol] = this.tokenizeNode(child, tokens);
Expand All @@ -75,7 +78,9 @@ export class CodeTokenizer extends Tokenizer {
}
}

tokens.push(this.newToken(")", location));
if (includeToken) {
tokens.push(this.newToken(")", location));
}

// Also return the startRow and startCol, this can be used by the parent node.
return [location.startRow, location.startCol];
Expand Down
7 changes: 5 additions & 2 deletions lib/src/lib/tokenizer/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,13 @@ export interface Token {
location: Region;
}

export abstract class Tokenizer {
/**
 * Optional settings accepted by a Tokenizer.
 *
 * Wrapped in `Partial`, so every field may be omitted by the caller.
 */
export type TokenizerOptions = Partial<{
/**
 * When true, the code tokenizer also emits tokens for syntax nodes whose
 * type contains "comment"; otherwise those nodes produce no tokens of
 * their own.
 */
includeComments: boolean;
}>

constructor(public readonly language: Language) {}
export abstract class Tokenizer {

constructor(public readonly language: Language, protected readonly options: TokenizerOptions = {}) {}

/**
* Runs the parser on a given string. Returns a list of Tokens
Expand Down
Loading

0 comments on commit a8ac373

Please sign in to comment.