mozilla · jpangas · Jan 12, 2024 · Jan 15, 2024 · Jan 16, 2024 · Jan 18, 2024
diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import re
+import sys
+from collections import defaultdict
+from typing import Any
+
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+
+class CommentFeature:
+    """Base class for the comment feature extractors defined in this module."""
+
+
+class CommentExtractor(BaseEstimator, TransformerMixin):
+    """Turn comments into a DataFrame of extracted features and cleaned text.
+
+    Args:
+        feature_extractors: Callables invoked as
+            ``extractor(comment, commenter_experience=...)``. Two extractors
+            of the same type are rejected.
+        cleanup_functions: Callables applied, in order, to each comment's
+            ``text``. Two cleanup functions of the same type are rejected.
+    """
+
+    def __init__(
+        self,
+        feature_extractors,
+        cleanup_functions,
+    ):
+        extractor_types = {type(fe) for fe in feature_extractors}
+        assert len(extractor_types) == len(
+            feature_extractors
+        ), "Duplicate Feature Extractors"
+        self.feature_extractors = feature_extractors
+
+        cleanup_types = {type(cf) for cf in cleanup_functions}
+        assert len(cleanup_types) == len(
+            cleanup_functions
+        ), "Duplicate Cleanup Functions"
+        self.cleanup_functions = cleanup_functions
+
+    def fit(self, x, y=None):
+        """Fit every feature extractor that exposes a ``fit`` method.
+
+        ``x`` is a callable; it is invoked once per fittable extractor and its
+        result is handed to that extractor's ``fit``. ``y`` is unused.
+        """
+        fittable = (fe for fe in self.feature_extractors if hasattr(fe, "fit"))
+        for extractor in fittable:
+            extractor.fit(x())
+
+        return self
+
+    def transform(self, comments):
+        """Build one row per comment with ``data`` and ``comment_text`` columns.
+
+        ``comments`` is a callable returning an iterable of comments; each
+        comment is indexed by its ``creator`` and ``text`` keys.
+        """
+        source = iter(comments())
+
+        # Number of comments already processed for each creator in this call.
+        seen_by_creator = defaultdict(int)
+
+        def rows():
+            for comment in source:
+                features = {}
+
+                for extractor in self.feature_extractors:
+                    value = extractor(
+                        comment,
+                        commenter_experience=seen_by_creator[comment["creator"]],
+                    )
+                    # Prefer the extractor's human-readable name when it has one.
+                    label = getattr(extractor, "name", extractor.__class__.__name__)
+
+                    if value is None:
+                        continue
+
+                    if isinstance(value, (list, set)):
+                        # A collection becomes one boolean flag per member.
+                        for member in value:
+                            features[sys.intern(f"{member} in {label}")] = True
+                    else:
+                        features[label] = value
+
+                # Count this comment only after its own features were computed.
+                seen_by_creator[comment["creator"]] += 1
+
+                cleaned = comment["text"]
+                for cleanup in self.cleanup_functions:
+                    cleaned = cleanup(cleaned)
+
+                yield {"data": features, "comment_text": cleaned}
+
+        return pd.DataFrame(rows())
+
+
+class CommenterExperience(CommentFeature):
+    """Expose the ``commenter_experience`` value supplied by the caller."""
+
+    name = "# of Comments made by Commenter in the past"
+
+    def __call__(self, comment, commenter_experience, **kwargs):
+        # The count is computed by the caller; this feature only forwards it.
+        experience = commenter_experience
+        return experience
+
+
+class CommentHasLink(CommentFeature):
+    """Flag comments whose text contains a link to a non-Mozilla host."""
+
+    name = "Comment Has a Link"
+
+    # We check for links that are not from Mozilla. The negative lookahead
+    # skips URLs whose host is mozilla.org / mozilla.com or any subdomain of
+    # them (e.g. bugzilla.mozilla.org). Its inner lookahead requires the host
+    # to end right after the TLD, so look-alikes such as
+    # mozilla.org.example.com or mozilla.org@example.com still count as links.
+    url_pattern = re.compile(
+        r"https?://(?!(?:[\w-]+\.)*mozilla\.(?:org|com)(?![\w-]|\.[\w-]|@))\S+"
+    )
+
+    def __call__(self, comment, **kwargs) -> Any:
+        """Return True when ``comment["text"]`` holds a non-Mozilla URL."""
+        return bool(self.url_pattern.search(comment["text"]))
+
+
+class LengthofComment(CommentFeature):
+    """Report the length of the comment's ``text`` value."""
+
+    name = "Length of Comment"
+
+    def __call__(self, comment, **kwargs):
+        text = comment["text"]
+        return len(text)
+
+
+class TimeCommentWasPosted(CommentFeature):
+    """Placeholder feature: no value is computed yet."""
+
+    name = "Time Comment Was Posted"
+
+    def __call__(self, comment, **kwargs):
+        # Not implemented: the call always evaluates to None.
+        return None
+
+
+class TimeDifferenceCommentAccountCreation(CommentFeature):
+    """Placeholder feature: no value is computed yet.
+
+    Args (of ``__call__``):
+        comment: The comment being processed.
+        account_creation_time: Optional; currently unused.
+    """
+
+    # NOTE(review): the label ends with a trailing space; left untouched here
+    # because it is a runtime string.
+    name = "Time Difference Between Account Creation and when Comment was Made "
+
+    def __call__(self, comment, account_creation_time=None, **kwargs):
+        # ``account_creation_time`` defaults to None so that a caller which
+        # only passes ``commenter_experience`` (as CommentExtractor does) does
+        # not fail with a TypeError for a missing argument.
+        return None
+
+
+class CommentTags(CommentFeature):
+    """List the comment's tags, leaving out an optional set of ignored tags.
+
+    Args:
+        to_ignore: Collection of tags to leave out of the result. When
+            omitted, no tag is ignored.
+    """
+
+    name = "Comment Tags"
+
+    def __init__(self, to_ignore=None):
+        # Build a fresh set per instance: a ``set()`` default in the signature
+        # would be a single object shared by every instance created without
+        # an explicit argument.
+        self.to_ignore = set() if to_ignore is None else to_ignore
+
+    def __call__(self, comment, **kwargs):
+        """Return the tags of ``comment`` that are not in ``to_ignore``."""
+        return [tag for tag in comment["tags"] if tag not in self.to_ignore]
diff --git a/bugbug/model.py b/bugbug/model.py
@@ -216,6 +216,8 @@ def get_human_readable_feature_names(self):
                 feature_name = f"Comments contain '{feature_name}'"
             elif type_ == "text":
                 feature_name = f"Combined text contains '{feature_name}'"
+            elif type_ == "comment_text":
+                feature_name = f"Comment text contains '{feature_name}'"
             elif type_ == "files":
                 feature_name = f"File '{feature_name}'"
             elif type_ not in ("data", "couple_data"):
@@ -803,3 +805,18 @@ def items_gen(self, classes):
                 continue
 
             yield issue, classes[issue_number]
+
+
+class CommentModel(Model):
+    """Model base class whose training items are individual bug comments."""
+
+    def __init__(self, lemmatization=False):
+        Model.__init__(self, lemmatization)
+        self.training_dbs = [bugzilla.BUGS_DB]
+
+    def items_gen(self, classes):
+        """Yield ``(comment, label)`` for every comment that has a label.
+
+        Args:
+            classes: Mapping from comment id to label.
+        """
+        for bug in bugzilla.get_bugs():
+            for comment in bug["comments"]:
+                comment_id = comment["id"]
+                if comment_id not in classes:
+                    continue
+
+                # The yield must live inside the comment loop: placed after
+                # it, only the last comment of each bug would be emitted, and
+                # an unlabeled last comment would raise a KeyError.
+                yield comment, classes[comment_id]
diff --git a/bugbug/models/__init__.py b/bugbug/models/__init__.py
@@ -29,6 +29,7 @@
     "regressionrange": "bugbug.models.regressionrange.RegressionRangeModel",
     "regressor": "bugbug.models.regressor.RegressorModel",
     "spambug": "bugbug.models.spambug.SpamBugModel",
+    "spamcomment": "bugbug.models.spamcomment.SpamCommentModel",
     "stepstoreproduce": "bugbug.models.stepstoreproduce.StepsToReproduceModel",
     "testlabelselect": "bugbug.models.testselect.TestLabelSelectModel",
     "testgroupselect": "bugbug.models.testselect.TestGroupSelectModel",

diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import logging
+
+import xgboost
+from imblearn.pipeline import Pipeline as ImblearnPipeline
+from imblearn.under_sampling import RandomUnderSampler
+from sklearn.compose import ColumnTransformer
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import Pipeline
+
+from bugbug import bugzilla, comment_features, feature_cleanup, utils
+from bugbug.model import CommentModel
+
+# Configure the root logger at INFO level when this module is imported.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+
+
+class SpamCommentModel(CommentModel):
+    """Classifier that labels bug comments as spam (1) or non-spam (0)."""
+
+    def __init__(self, lemmatization=True):
+        super().__init__(lemmatization)
+
+        self.calculate_importance = False
+
+        feature_extractors = [
+            comment_features.CommenterExperience(),
+            comment_features.CommentHasLink(),
+            comment_features.LengthofComment(),
+        ]
+        cleanup_functions = [
+            feature_cleanup.fileref(),
+            feature_cleanup.url(),
+            feature_cleanup.synonyms(),
+        ]
+        comment_extractor = comment_features.CommentExtractor(
+            feature_extractors, cleanup_functions
+        )
+        self.extraction_pipeline = Pipeline([("comment_extractor", comment_extractor)])
+
+        # The extracted feature dict and the cleaned text are vectorized
+        # separately, each from its own column.
+        union = ColumnTransformer(
+            [
+                ("data", DictVectorizer(), "data"),
+                ("comment_text", self.text_vectorizer(min_df=0.001), "comment_text"),
+            ]
+        )
+        sampler = RandomUnderSampler(random_state=0, sampling_strategy="not minority")
+        estimator = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
+
+        self.clf = ImblearnPipeline(
+            [("union", union), ("sampler", sampler), ("estimator", estimator)]
+        )
+
+    def get_labels(self):
+        """Return ``(labels, [0, 1])`` where labels map comment id to 0 or 1.
+
+        A comment is labeled 1 when its tags contain ``"spam"`` and 0
+        otherwise; comments whose creator contains ``"@mozilla"`` get no label.
+        """
+        classes = {}
+
+        for bug in bugzilla.get_bugs(include_invalid=True):
+            for comment in bug["comments"]:
+                comment_id = comment["id"]
+
+                # Skip comments filed by Mozillians and bots, since we are
+                # sure they are not spam.
+                if "@mozilla" in comment["creator"]:
+                    continue
+
+                classes[comment_id] = 1 if "spam" in comment["tags"] else 0
+
+        non_spam_total = sum(label == 0 for label in classes.values())
+        logger.info("%d comments are classified as non-spam", non_spam_total)
+
+        spam_total = sum(label == 1 for label in classes.values())
+        logger.info("%d comments are classified as spam", spam_total)
+
+        return classes, [0, 1]
+
+    def items_gen(self, classes):
+        """Return a generator of ``(comment, label)`` for labeled comments."""
+        # Overwriting this method to add include_invalid=True to get_bugs to
+        # include spam bugs which have a number of spam comments.
+        bugs = bugzilla.get_bugs(include_invalid=True)
+        return (
+            (comment, classes[comment["id"]])
+            for bug in bugs
+            for comment in bug["comments"]
+            if comment["id"] in classes
+        )
+
+    def get_feature_names(self):
+        """Return the feature names reported by the ``union`` pipeline step."""
+        union = self.clf.named_steps["union"]
+        return union.get_feature_names_out()
+
+    def overwrite_classes(self, comments, classes, probabilities):
+        """Force the non-spam result for comments whose creator has "@mozilla".
+
+        Entries of ``classes`` are replaced in place, by position, with
+        ``[1.0, 0.0]`` when ``probabilities`` is truthy and with ``0``
+        otherwise; ``classes`` is returned.
+        """
+        for index, comment in enumerate(comments):
+            if "@mozilla" not in comment["creator"]:
+                continue
+
+            classes[index] = [1.0, 0.0] if probabilities else 0
+
+        return classes