Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[new model] Identify spam comments #3994

Open
wants to merge 67 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
efef0bc
Create spamcomment model
jpangas Jan 12, 2024
bff58c5
Add New Features
jpangas Jan 15, 2024
48871dc
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Jan 16, 2024
61b0fe0
Include new features and change spamcom
jpangas Jan 18, 2024
e31fa75
Version 0.0.534
suhaibmujahid Jan 19, 2024
5103030
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Jan 19, 2024
a69cc54
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Jan 22, 2024
d365ad3
Create comments extractor
jpangas Jan 23, 2024
9ce864a
Remove comment features from Bug Features
jpangas Jan 23, 2024
77d534d
Add New features
jpangas Jan 24, 2024
73f74a4
Refine Link feature
jpangas Jan 25, 2024
2d65489
Test with TomekLinks
jpangas Jan 29, 2024
501a89f
Change df in text vectorizer
jpangas Jan 29, 2024
606f743
Use oversampling
jpangas Feb 2, 2024
41a73cb
Use max_step
jpangas Feb 6, 2024
586576d
Include and Refine features
jpangas Feb 7, 2024
ba7a1a1
Split Date Features
jpangas Feb 9, 2024
8f429d1
Rename features correctly
jpangas Feb 9, 2024
1ef2493
Remove Commenter Experience and Invalid Bugs
jpangas Feb 12, 2024
5a18517
Remove first comment
jpangas Feb 13, 2024
ea6c168
Include Links Dictionary
Feb 15, 2024
874b19f
Fix Error and Lint
jpangas Feb 15, 2024
b3da2e5
Refactor the Links Dictionary
jpangas Feb 15, 2024
b49485d
Use List instead
jpangas Feb 15, 2024
71fe950
Merge remote-tracking branch 'origin/master' into spamcom
jpangas Feb 16, 2024
4626064
Merge remote-tracking branch 'origin/spamcom' into spamcom
jpangas Feb 16, 2024
a7044b0
Use Dictionary for # of links
jpangas Feb 16, 2024
13772c7
Include older bugs
jpangas Feb 19, 2024
7cf0dcd
Replace Weekday with Weekend
jpangas Feb 19, 2024
cc8e6f6
Include max_delta_step
jpangas Feb 20, 2024
c4e4f22
Revert "Include max_delta_step"
jpangas Feb 20, 2024
01cca1e
Test using scale_pos_weight
jpangas Feb 20, 2024
cc42dee
Use URL Extract
jpangas Feb 20, 2024
4b8cf49
Revert to Using Regex
jpangas Feb 21, 2024
5c5da8c
Introduce new extraction func and features
jpangas Feb 22, 2024
dc16331
Include tests for extraction function
jpangas Feb 22, 2024
e5b0349
Change scale_pos_weight value
jpangas Feb 22, 2024
644795a
Change regex for extraction
jpangas Feb 22, 2024
45097da
Include tld_extract library
jpangas Feb 22, 2024
0a06ea3
Test without scale_pos_weight
jpangas Feb 22, 2024
e193764
Test with n_estimators changed
jpangas Feb 22, 2024
dda9b95
Test with GridSearch CV Values
jpangas Feb 23, 2024
5ba0c22
Remove scale_pos_weight from model.py
jpangas Feb 23, 2024
ca16b98
Set n_estimators to 1000
jpangas Feb 23, 2024
18d18f0
Revert "Remove scale_pos_weight from model.py"
jpangas Feb 23, 2024
1d35968
Remove comments which have 'redacted-
jpangas Feb 23, 2024
0a21b61
Test with new parameters
jpangas Feb 25, 2024
00a9f9f
Change df
jpangas Feb 25, 2024
f55d137
Test: Include tags as feature
jpangas Feb 26, 2024
dbcb311
Exclude comment tags
jpangas Feb 26, 2024
1b437da
Exclude emails from commit authors
jpangas Feb 27, 2024
16e14c5
Test without scale pos weight
jpangas Feb 27, 2024
94ab283
Test with scale_pos_weight adjusted
jpangas Feb 27, 2024
5a58108
Adjust scale pos weight
jpangas Feb 28, 2024
3eab988
Test wihout WeekOfYear
jpangas Mar 1, 2024
bd16d56
Include comment classifier
jpangas Mar 6, 2024
0a11f3c
Include script in setup
jpangas Mar 6, 2024
a3956b4
Fix script error
jpangas Mar 6, 2024
5c93d23
Fix setup error
jpangas Mar 6, 2024
15c8d5a
Classify all comments
jpangas Mar 7, 2024
5f953ac
Include spamcom in model names
jpangas Mar 13, 2024
df77a40
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Mar 13, 2024
4cd6c6d
Merge branch 'mozilla:master' into spamcom
jpangas Mar 13, 2024
4237f8f
Remove comment independent files
jpangas Mar 14, 2024
ba2ece2
Merge remote-tracking branch 'origin/spamcom' into spamcom
jpangas Mar 14, 2024
5490d01
Use(bug,comment) tuple
jpangas Mar 26, 2024
d95852d
Include BugvsCreator Feature
jpangas Apr 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions bugbug/comment_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import re
import sys
from collections import defaultdict
from typing import Any

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class CommentFeature:
    """Marker base class shared by the comment feature extractors in this module."""


class CommentExtractor(BaseEstimator, TransformerMixin):
    """Transformer that turns comments into feature dicts plus cleaned text.

    ``feature_extractors`` are callables invoked once per comment, and
    ``cleanup_functions`` are callables applied in order to the comment text.
    Neither list may contain two objects of the same type.
    """

    def __init__(
        self,
        feature_extractors,
        cleanup_functions,
    ):
        extractor_types = {type(extractor) for extractor in feature_extractors}
        assert len(extractor_types) == len(
            feature_extractors
        ), "Duplicate Feature Extractors"
        self.feature_extractors = feature_extractors

        cleanup_types = {type(cleanup) for cleanup in cleanup_functions}
        assert len(cleanup_types) == len(
            cleanup_functions
        ), "Duplicate Cleanup Functions"
        self.cleanup_functions = cleanup_functions

    def fit(self, x, y=None):
        """Fit every extractor that exposes ``fit``; ``x`` is called to get the comments."""
        for extractor in self.feature_extractors:
            if not hasattr(extractor, "fit"):
                continue

            # ``x`` is invoked again for each fittable extractor.
            extractor.fit(x())

        return self

    def transform(self, comments):
        """Build a DataFrame with one ``data`` / ``comment_text`` row per comment.

        ``comments`` is a callable returning an iterable of comment dicts; each
        comment must provide the ``creator`` and ``text`` keys.
        """
        source = comments()

        # Number of comments already processed in this pass, keyed by creator.
        seen_per_creator = defaultdict(int)

        def build_row(comment):
            features = {}

            for extractor in self.feature_extractors:
                value = extractor(
                    comment,
                    commenter_experience=seen_per_creator[comment["creator"]],
                )

                label = (
                    extractor.name
                    if hasattr(extractor, "name")
                    else extractor.__class__.__name__
                )

                if value is not None:
                    if isinstance(value, (list, set)):
                        # Collections become one boolean feature per item.
                        for item in value:
                            features[sys.intern(f"{item} in {label}")] = True
                    else:
                        features[label] = value

            # Counted only after extraction, so the current comment is never
            # included in its own creator's experience.
            seen_per_creator[comment["creator"]] += 1

            text = comment["text"]
            for cleanup in self.cleanup_functions:
                text = cleanup(text)

            return {"data": features, "comment_text": text}

        return pd.DataFrame(build_row(comment) for comment in source)


class CommenterExperience(CommentFeature):
    """Expose the commenter's prior comment count as a feature."""

    name = "# of Comments made by Commenter in the past"

    def __call__(self, comment, commenter_experience, **kwargs):
        # The count is supplied by the caller; this feature only forwards it.
        return commenter_experience


class CommentHasLink(CommentFeature):
    """Flag comments that contain at least one link to a non-Mozilla host."""

    name = "Comment Has a Link"

    # We check for links that are not from Mozilla. The negative lookahead
    # rejects a URL only when its host is mozilla.org / mozilla.com or one of
    # their subdomains (e.g. bugzilla.mozilla.org). The nested lookahead keeps
    # look-alike hosts such as mozilla.org.example.com, or a "mozilla.org@host"
    # userinfo prefix, from being treated as Mozilla links.
    url_pattern = re.compile(
        r"https?://(?!(?:[\w-]+\.)*mozilla\.(?:org|com)(?![\w.@-]))\S+"
    )

    def __call__(self, comment, **kwargs) -> Any:
        """Return True when ``comment["text"]`` contains a non-Mozilla URL."""
        return bool(self.url_pattern.search(comment["text"]))


class LengthofComment(CommentFeature):
    """Use ``len()`` of the comment text as a feature."""

    name = "Length of Comment"

    def __call__(self, comment, **kwargs):
        text = comment["text"]
        return len(text)


class TimeCommentWasPosted(CommentFeature):
    """Placeholder feature: not implemented yet, so it always yields ``None``."""

    name = "Time Comment Was Posted"

    def __call__(self, comment, **kwargs):
        # No value is produced until the feature is implemented.
        return None


class TimeDifferenceCommentAccountCreation(CommentFeature):
    """Placeholder feature: not implemented yet, so it always yields ``None``."""

    name = "Time Difference Between Account Creation and when Comment was Made "

    def __call__(self, comment, account_creation_time=None, **kwargs):
        # ``account_creation_time`` defaults to None so the feature can be
        # invoked by callers that only pass ``comment`` and generic keyword
        # arguments; a required argument here would raise TypeError for them.
        return None


class CommentTags(CommentFeature):
    """List the tags attached to a comment, minus an optional ignore set."""

    name = "Comment Tags"

    def __init__(self, to_ignore=None):
        # Build a fresh set per instance: a ``set()`` default in the signature
        # would be one shared object mutated across every instance.
        self.to_ignore = set() if to_ignore is None else to_ignore

    def __call__(self, comment, **kwargs):
        """Return the entries of ``comment["tags"]`` not in ``to_ignore``, in order."""
        return [tag for tag in comment["tags"] if tag not in self.to_ignore]
17 changes: 17 additions & 0 deletions bugbug/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ def get_human_readable_feature_names(self):
feature_name = f"Comments contain '{feature_name}'"
elif type_ == "text":
feature_name = f"Combined text contains '{feature_name}'"
elif type_ == "comment_text":
feature_name = f"Comment text contains '{feature_name}'"
elif type_ == "files":
feature_name = f"File '{feature_name}'"
elif type_ not in ("data", "couple_data"):
Expand Down Expand Up @@ -803,3 +805,18 @@ def items_gen(self, classes):
continue

yield issue, classes[issue_number]


class CommentModel(Model):
    """Base model whose training items are individual bug comments."""

    def __init__(self, lemmatization=False):
        Model.__init__(self, lemmatization)
        # Comments are read from the bug records, hence the bugs database.
        self.training_dbs = [bugzilla.BUGS_DB]

    def items_gen(self, classes):
        """Yield ``(comment, label)`` for each comment whose id is in ``classes``."""
        for bug in bugzilla.get_bugs():
            for comment in bug["comments"]:
                comment_id = comment["id"]
                if comment_id in classes:
                    yield comment, classes[comment_id]
1 change: 1 addition & 0 deletions bugbug/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"regressionrange": "bugbug.models.regressionrange.RegressionRangeModel",
"regressor": "bugbug.models.regressor.RegressorModel",
"spambug": "bugbug.models.spambug.SpamBugModel",
"spamcomment": "bugbug.models.spamcomment.SpamCommentModel",
"stepstoreproduce": "bugbug.models.stepstoreproduce.StepsToReproduceModel",
"testlabelselect": "bugbug.models.testselect.TestLabelSelectModel",
"testgroupselect": "bugbug.models.testselect.TestGroupSelectModel",
Expand Down
127 changes: 127 additions & 0 deletions bugbug/models/spamcomment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import logging

import xgboost
from imblearn.pipeline import Pipeline as ImblearnPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline

from bugbug import bugzilla, comment_features, feature_cleanup, utils
from bugbug.model import CommentModel

# Module-level side effect: importing this module configures logging at INFO
# level via basicConfig.
logging.basicConfig(level=logging.INFO)
# Logger named after this module, used for the label statistics below.
logger = logging.getLogger(__name__)


class SpamCommentModel(CommentModel):
    """Binary classifier labelling comments as spam (1) or not spam (0)."""

    def __init__(self, lemmatization=True):
        CommentModel.__init__(self, lemmatization)

        self.calculate_importance = False

        extractors = [
            comment_features.CommenterExperience(),
            comment_features.CommentHasLink(),
            comment_features.LengthofComment(),
        ]

        cleanups = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        comment_extractor = comment_features.CommentExtractor(extractors, cleanups)
        self.extraction_pipeline = Pipeline(
            [("comment_extractor", comment_extractor)]
        )

        # Vectorise the two extracted columns side by side.
        column_union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                (
                    "comment_text",
                    self.text_vectorizer(min_df=0.001),
                    "comment_text",
                ),
            ]
        )
        sampler = RandomUnderSampler(random_state=0, sampling_strategy="not minority")
        estimator = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())

        self.clf = ImblearnPipeline(
            [
                ("union", column_union),
                ("sampler", sampler),
                ("estimator", estimator),
            ]
        )

    def get_labels(self):
        """Return ``(classes, [0, 1])``; ``classes`` maps comment id to 1 (spam) or 0."""
        classes = {}

        for bug in bugzilla.get_bugs(include_invalid=True):
            for comment in bug["comments"]:
                comment_id = comment["id"]

                # Skip comments filed by Mozillians and bots, since we are sure they are not spam.
                if "@mozilla" in comment["creator"]:
                    continue

                classes[comment_id] = 1 if "spam" in comment["tags"] else 0

        # Labels are only ever 0 or 1, so their sum is the number of spam comments.
        spam_total = sum(classes.values())
        logger.info(
            "%d comments are classified as non-spam",
            len(classes) - spam_total,
        )
        logger.info(
            "%d comments are classified as spam",
            spam_total,
        )

        return classes, [0, 1]

    def items_gen(self, classes):
        """Return a generator of ``(comment, label)`` pairs for labelled comments."""
        # Overrides the parent method so that get_bugs is called with
        # include_invalid=True, which includes spam bugs that have a number of
        # spam comments.
        bugs = bugzilla.get_bugs(include_invalid=True)
        return (
            (comment, classes[comment["id"]])
            for bug in bugs
            for comment in bug["comments"]
            if comment["id"] in classes
        )

    def get_feature_names(self):
        """Return the feature names produced by the ``union`` pipeline step."""
        union_step = self.clf.named_steps["union"]
        return union_step.get_feature_names_out()

    def overwrite_classes(self, comments, classes, probabilities):
        """Force the non-spam result for comments whose creator contains ``@mozilla``."""
        for index, comment in enumerate(comments):
            if "@mozilla" not in comment["creator"]:
                continue

            # With probabilities the entry is [1.0, 0.0]; otherwise it is label 0.
            classes[index] = [1.0, 0.0] if probabilities else 0

        return classes