Dev 003 (#14)

- fixing percentage_outliers (was still there) - adding missingness for graphs - adding leet speak
Fraunhofer-IESE · Jul 13, 2023 · 83bb5e0 · 83bb5e0
2 parents 5ae79da + e14a8fc
commit 83bb5e0
Show file tree

Hide file tree

Showing 9 changed files with 347 additions and 10 deletions.
diff --git a/badgers/generators/graph/missingness.py b/badgers/generators/graph/missingness.py
@@ -1,13 +1,16 @@
 import abc
+from copy import copy
 from typing import Tuple
 
+import networkx as nx
 import numpy
+import numpy as np
 from numpy.random import default_rng
 
 from badgers.core.base import GeneratorMixin
 
 
-class MissingNodesGenerator(GeneratorMixin):
+class MissingGenerator(GeneratorMixin):
     """
     Base class for missing nodes transformer
     """
@@ -25,3 +28,65 @@ def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.
     @abc.abstractmethod
     def generate(self, X, y=None, **params) -> Tuple:
         pass
+
+
+class NodesMissingCompletelyAtRandom(MissingGenerator):
+    """
+    Removes nodes from the graph uniformly at random.
+    """
+
+    def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.Generator = default_rng(seed=0)):
+        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)
+
+    def generate(self, X, y=None, **params) -> Tuple:
+        """
+        Return a copy of X without the sampled nodes, and y without the entries at the same positions.
+
+        :param X: a networkx.Graph; anything else raises NotImplementedError
+        :param y: optional array-like, assumed to be aligned with the iteration order of X.nodes()
+        """
+        if not isinstance(X, nx.Graph):
+            raise NotImplementedError('badgers does only support networkx.Graph objects for graphs')
+
+        nodes = list(X.nodes())
+        # Sample positions, not labels: np.delete needs indices and labels need not be 0..n-1.
+        positions = self.random_generator.choice(
+            len(nodes), int(len(nodes) * self.percentage_missing / 100), replace=False
+        )
+        Xt = X.copy()
+        Xt.remove_nodes_from([nodes[i] for i in positions])
+        yt = None if y is None else np.delete(y, positions)
+        return Xt, yt
+
+
+class EdgesMissingCompletelyAtRandom(MissingGenerator):
+    """Removes edges from the graph uniformly at random."""
+
+    def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.Generator = default_rng(seed=0)):
+        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)
+
+    def generate(self, X, y=None, **params) -> Tuple:
+        """
+        Return a copy of X without the sampled edges, and a shallow copy of y without their entries.
+        :param X: a networkx.Graph; anything else raises NotImplementedError
+        :param y: None, or a dict keyed by (u, v) edge tuples ((v, u) is tried too when X is undirected)
+        """
+        if not isinstance(X, nx.Graph):
+            raise NotImplementedError('badgers does only support networkx.Graph objects for graphs')
+
+        edges = list(X.edges())
+        # Keep the edges as tuples: rows of a numpy array are unhashable, so they cannot index the dict y.
+        positions = self.random_generator.choice(
+            len(edges), int(len(edges) * self.percentage_missing / 100), replace=False
+        )
+        edges_to_be_removed = [edges[i] for i in positions]
+        Xt = X.copy()
+        Xt.remove_edges_from(edges_to_be_removed)
+        if y is None:
+            return Xt, None
+        if not isinstance(y, dict):
+            raise ValueError(f'This type of y is not supported {type(y)}, {y}')
+        yt = copy(y)
+        for u, v in edges_to_be_removed:
+            del yt[(u, v) if (u, v) in yt or X.is_directed() else (v, u)]
+        return Xt, yt
diff --git a/badgers/generators/text/typos.py b/badgers/generators/text/typos.py
@@ -58,4 +58,68 @@ def generate(self, X, y, **params) -> Tuple:
                 # save the word with switched letters as string
                 X[i] = ''.join(word)
 
-        return X, None
+        return X, y
+
+
+class LeetSpeakGenerator(TyposGenerator):
+    """Replaces characters with leet speak counterparts, each with probability replacement_proba."""
+
+    def __init__(self, random_generator=default_rng(seed=0), replacement_proba: float = 0.1):
+        """
+        :param random_generator: a random number generator
+        :param replacement_proba: the probability of replacing a letter with its leet counterpart
+        """
+        super().__init__(random_generator=random_generator)
+        assert 0 <= replacement_proba <= 1
+        self.replacement_proba = replacement_proba
+        # keys are the upper-case letters A-Z; randomly_replace_letter looks characters up via letter.upper()
+        self.leet_speak_mapping = {
+            "A": ["4", "/\\", "@", "/-\\", "^", "(L", "\u0414"],
+            "B": ["I3", "8", "13", "|3", "\u00df", "!3", "(3", "/3", ")3", "|-]", "j3"],
+            "C": ["[", "\u00a2", "<", "(", "\u00a9"],
+            "D": [")", "|)", "(|", "[)", "I>", "|>", "?", "T)", "I7", "cl", "|}", "|]"],
+            "E": ["3", "&", "\u00a3", "\u20ac", "[-", "|=-"],
+            "F": ["|=", "\u0192", "|#", "ph", "/=", "v"],
+            "G": ["6", "&", "(_+", "9", "C-", "gee", "(?,", "[,", "{,", "<-", "(."],
+            "H": ["#", "/-/", "\\-\\", "[-]", "]-[", ")-(", "(-)", ":-:", "|~|", "|-|", "]~[", "}{", "!-!", "1-1",
+                  "\\-/", "I+I", "?"],
+            "I": ["1", "|", "][", "!", "eye", "3y3"],
+            "J": [",_|", "_|", "._|", "._]", "_]", ",_]", "]"],
+            "K": [">|", "|<", "1<", "|c", "|(7<"],
+            "L": ["1", "2", "\u00a3", "7", "|_", "|"],
+            "M": ["/\\/\\", "/V\\", "[V]", "|\\/|", "^^", "<\\/>", "{V}", "(v)", "(V)", "|\\|\\", "]\\/[", "nn", "11"],
+            "N": ["^/", "|\\|", "/\\/", "[\\]", "<\\>", "{\\}", "/V", "^", "\u0e17", "\u0418"],
+            "O": ["0", "()", "oh", "[]", "p", "<>", "\u00d8"],
+            "P": ["|*", "|o", "|\u00ba", "?", "|^", "|>", "|\"", "9", "[]D", "|\u00b0", "|7"],
+            "Q": ["(_,)", "()_", "2", "0_", "<|", "&", "9", "\u00b6", "\u204b", "\u2117"],
+            "R": ["I2", "9", "|`", "|~", "|?", "/2", "|^", "lz", "7", "2", "12", "\u00ae", "[z", "\u042f", ".-", "|2",
+                  "|-", "3"],
+            "S": ["5", "$", "z", "\u00a7", "ehs", "es", "2"],
+            "T": ["7", "+", "-|-", "']['", "\u2020", "\u00ab|\u00bb", "~|~"],
+            "U": ["(_)", "|_|", "v", "L|", "\u0e1a"],
+            "V": ["\\/", "|/", "\\|"],
+            "W": ["\\/\\/", "vv", "\\N", "'//", "\\\\'", "\\^/", "\\/\\/", "(n)", "\\V/", "\\X/", "\\|/", "\\_|_/",
+                  "\\_:_/", "uu", "2u", "\\\\//\\\\//", "\u0e1e", "\u20a9"],
+            "X": ["><", "}{", "ecks", "\u00d7", "?", "}{", ")(", "]["],
+            "Y": ["j", "`/", "\\|/", "\u00a5", "\\//"],
+            "Z": ["2", "7_", "-/_", "%", ">_", "s", "~/_", "-\\_", "-|_"]
+        }
+
+    def randomly_replace_letter(self, letter):
+        """
+        Randomly replace a letter with one of its leet counterparts
+        :param letter: a single character; one without a mapping (digit, punctuation, ...) is returned as is
+        :return: the replacement string, or the unchanged character
+        """
+        # One draw per character, mapped or not; the mapping is only consulted when the draw hits.
+        if self.random_generator.random() >= self.replacement_proba:
+            return letter
+        candidates = self.leet_speak_mapping.get(letter.upper())
+        if candidates is None:
+            return letter
+        return self.random_generator.choice(candidates)
+
+    def generate(self, X, y, **params) -> Tuple:
+        """Apply randomly_replace_letter to every character of every string in X; y is returned unchanged."""
+        Xt = [''.join(self.randomly_replace_letter(letter) for letter in word) for word in X]
+        return Xt, y
diff --git a/docs/about.md b/docs/about.md
@@ -7,8 +7,21 @@ The original idea came during internal discussions with Patricia Kelbert, Adam T
 A big thanks to Daniel Seifert for taking the time to investigate and automate all the many things that make our lives as package contributors easier.
 
 ## Citing Badgers
-
-Work In Progess
+
+If you use [badgers](https://github.com/Fraunhofer-IESE/badgers) in scientific publications, you can cite the following paper:
+
+Julien Siebert, Daniel Seifert, Patricia Kelbert, Michael Kläs, Adam Trendowicz (2023). Badgers: generating data quality deficits with Python. https://arxiv.org/abs/2307.04468
+
+```
+@misc{siebert2023badgers,
+      title={Badgers: generating data quality deficits with Python}, 
+      author={Julien Siebert and Daniel Seifert and Patricia Kelbert and Michael Kläs and Adam Trendowicz},
+      year={2023},
+      eprint={2307.04468},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+```
 
 ## Contributing to Badgers
 

diff --git a/docs/tutorials/Typos-Text.ipynb b/docs/tutorials/Typos-Text.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c695b94a-1032-46a3-9a97-421f1a4a43bd",
+   "metadata": {},
+   "source": [
+    "# Adding typos to text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "d940b272-c15a-4b3b-b068-bc7308599534",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from badgers.generators.text.typos import SwapLettersGenerator, LeetSpeakGenerator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "8d8acaee-95ca-4ac1-910d-6b79256564a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = \"the quick brown fox jumps over the lazy dog\".split(' ')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4da6eb3c-cd91-41ad-9db8-6a891bad6072",
+   "metadata": {},
+   "source": [
+    "## Swapping letters randomly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "196be517-21ee-4f50-8a3c-57b454fcc470",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "swap_letters = SwapLettersGenerator(swap_proba=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "bdbb18f3-29fd-48e0-8450-8de9266dd75a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Xt, _ = swap_letters.generate(X.copy(), None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4a2c09cb-9a50-474e-be2f-a9a8a9362748",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']\n",
+      "['the', 'qucik', 'borwn', 'fox', 'jmups', 'oevr', 'the', 'lzay', 'dog']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(X)\n",
+    "print(Xt)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d1c389f2-5ce5-4fe6-89d0-f8002949113e",
+   "metadata": {},
+   "source": [
+    "## Leet Speak"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "43e634b8-2657-4e28-ac59-fa190298937d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "leet_speak = LeetSpeakGenerator(replacement_proba=0.1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0b6f7e67-e68a-446c-a8e7-25823895ea0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Xt, _ = leet_speak.generate(X.copy(), None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "04d104f6-da91-4e50-981a-72b703903a6a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']\n",
+      "['th3', 'quick', 'br0w^', 'fox', 'ju|\\\\|\\\\ps', 'over', 'the', 'lazy', 'dog']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(X)\n",
+    "print(Xt)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -14,6 +14,8 @@ nav:
                     -   Generating Missing Data: tutorials/Missingness-Tabular-Data.ipynb
                     -   Generating Noise: tutorials/Noise-Tabular-Data.ipynb
                     -   Generating Outliers: tutorials/Outliers-Tabular-Data.ipynb
+            -   Text Data:
+                    -   Generating Typos: tutorials/Typos-Text.ipynb
             -   Pipelines:
                     -   Chaining Generators: tutorials/Pipeline-Tabular-Data.ipynb
     -   API: reference/

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "badgers"
-version = "0.0.2"
+version = "0.0.3"
 keywords = ["data quality", "bad data", "data science"]
 authors = [
     { name = "Julien Siebert", email = "[email protected]" },
@@ -15,17 +15,18 @@ maintainers = [
 description = "Badgers: bad data generators"
 readme = "README.md"
 requires-python = ">=3.8"
-license = {file = "LICENSE"}
+license = { file = "LICENSE" }
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: BSD License",
     "Operating System :: OS Independent",
     "Development Status :: 1 - Planning",
 ]
 dependencies = [
-  "numpy",
-  "pandas",
-  "scikit-learn",
+    "numpy",
+    "pandas",
+    "scikit-learn",
+    "networkx"
 ]
 [project.urls]
 "Homepage" = "https://github.com/Fraunhofer-IESE/badgers"

diff --git a/tests/generators/graph/__init__.py b/tests/generators/graph/__init__.py