Skip to content

Commit

Permalink
Dev 003 (#14)
Browse files Browse the repository at this point in the history
- fixing percentage_outliers (was still there)
- adding missingness for graphs
- adding leet speak
  • Loading branch information
siebert-julien authored Jul 13, 2023
2 parents 5ae79da + e14a8fc commit 83bb5e0
Show file tree
Hide file tree
Showing 9 changed files with 347 additions and 10 deletions.
67 changes: 66 additions & 1 deletion badgers/generators/graph/missingness.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import abc
from copy import copy
from typing import Tuple

import networkx as nx
import numpy
import numpy as np
from numpy.random import default_rng

from badgers.core.base import GeneratorMixin


class MissingNodesGenerator(GeneratorMixin):
class MissingGenerator(GeneratorMixin):
"""
Base class for missing nodes transformer
"""
Expand All @@ -25,3 +28,65 @@ def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.
@abc.abstractmethod
def generate(self, X, y=None, **params) -> Tuple:
pass


class NodesMissingCompletelyAtRandom(MissingGenerator):
    """
    Removes nodes from the graph uniformly at random.
    """

    def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.Generator = default_rng(seed=0)):
        """
        :param percentage_missing: the percentage of nodes to remove (divided by 100 in `generate`)
        :param random_generator: a random number generator.
            NOTE: the default generator is created once, when this signature is evaluated,
            and is therefore shared by every instance that is not given its own generator.
        """
        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)

    def generate(self, X, y=None, **params) -> Tuple:
        """
        Returns a copy of the graph from which randomly chosen nodes have been removed.

        :param X: the input graph, must be a networkx.Graph
        :param y: optional array-like of per-node values (or None)
        :param params: unused
        :return: a tuple (Xt, yt): Xt is the copy of X without the removed nodes, yt is y
            without the entries of the removed nodes (None if y is None)
        :raises NotImplementedError: if X is not a networkx.Graph
        """
        if not isinstance(X, nx.Graph):
            raise NotImplementedError('badgers does only support networkx.Graph objects for graphs')

        # Sample positions rather than the node labels themselves: handing the labels to
        # numpy would coerce them into an array, which turns tuple labels into unhashable
        # rows and mixed-type labels into strings that no longer match any node.
        nodes = list(X.nodes())
        number_to_remove = int(X.number_of_nodes() * self.percentage_missing / 100)
        positions = self.random_generator.choice(len(nodes), number_to_remove, replace=False)
        nodes_to_be_removed = [nodes[position] for position in positions]

        # work on a copy so that the input graph is left untouched
        Xt = X.copy()
        Xt.remove_nodes_from(nodes_to_be_removed)

        if y is not None:
            # NOTE(review): the node labels are used as positions in y, i.e. this presumably
            # expects integer labels 0..n-1 matching the layout of y -- confirm with callers.
            yt = np.delete(y, nodes_to_be_removed)
        else:
            yt = None

        return Xt, yt


class EdgesMissingCompletelyAtRandom(MissingGenerator):
    """
    Removes edges from the graph uniformly at random.
    """

    def __init__(self, percentage_missing: int = 10, random_generator: numpy.random.Generator = default_rng(seed=0)):
        """
        :param percentage_missing: the percentage of edges to remove (divided by 100 in `generate`)
        :param random_generator: a random number generator.
            NOTE: the default generator is created once, when this signature is evaluated,
            and is therefore shared by every instance that is not given its own generator.
        """
        super().__init__(percentage_missing=percentage_missing, random_generator=random_generator)

    def generate(self, X, y=None, **params) -> Tuple:
        """
        Returns a copy of the graph from which randomly chosen edges have been removed.

        :param X: the input graph, must be a networkx.Graph
        :param y: optional dict keyed by the edges as yielded by `X.edges()` (or None)
        :param params: unused
        :return: a tuple (Xt, yt): Xt is the copy of X without the removed edges, yt is a
            shallow copy of y without the keys of the removed edges (None if y is None)
        :raises NotImplementedError: if X is not a networkx.Graph
        :raises ValueError: if y is neither None nor a dict
        :raises KeyError: if a removed edge is not a key of y
        """
        if not isinstance(X, nx.Graph):
            raise NotImplementedError('badgers does only support networkx.Graph objects for graphs')

        # Sample positions rather than the edges themselves: handing the edges to numpy
        # yields a 2-D array whose rows are unhashable ndarrays, which cannot be used as
        # keys of y. Indexing back into the list keeps the original edge tuples.
        edges = list(X.edges())
        number_to_remove = int(X.number_of_edges() * self.percentage_missing / 100)
        positions = self.random_generator.choice(len(edges), number_to_remove, replace=False)
        edges_to_be_removed = [edges[position] for position in positions]

        # work on a copy so that the input graph is left untouched
        Xt = X.copy()
        Xt.remove_edges_from(edges_to_be_removed)

        if y is None:
            yt = None
        elif isinstance(y, dict):
            # shallow copy: the caller's dict keeps all of its keys
            yt = copy(y)
            for edge in edges_to_be_removed:
                del yt[edge]
        else:
            raise ValueError(f'This type of y is not supported {type(y)}, {y}')

        return Xt, yt
66 changes: 65 additions & 1 deletion badgers/generators/text/typos.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,4 +58,68 @@ def generate(self, X, y, **params) -> Tuple:
# save the word with switched letters as string
X[i] = ''.join(word)

return X, None
return X, y


class LeetSpeakGenerator(TyposGenerator):
    """
    Randomly replaces letters with one of their leet speak counterparts.
    """

    def __init__(self, random_generator=default_rng(seed=0), replacement_proba: float = 0.1):
        """
        :param random_generator: a random number generator.
            NOTE: the default generator is created once, when this signature is evaluated,
            and is therefore shared by every instance that is not given its own generator.
        :param replacement_proba: the probability of replacing a letter with its leet counterpart
        """
        super().__init__(random_generator=random_generator)
        assert 0 <= replacement_proba <= 1
        self.replacement_proba = replacement_proba
        # upper case letter -> list of possible leet replacements
        # (duplicated entries are kept on purpose: they weight the uniform draw)
        self.leet_speak_mapping = {
            "A": ["4", "/\\", "@", "/-\\", "^", "(L", "\u0414"],
            "B": ["I3", "8", "13", "|3", "\u00df", "!3", "(3", "/3", ")3", "|-]", "j3"],
            "C": ["[", "\u00a2", "<", "(", "\u00a9"],
            "D": [")", "|)", "(|", "[)", "I>", "|>", "?", "T)", "I7", "cl", "|}", "|]"],
            "E": ["3", "&", "\u00a3", "\u20ac", "[-", "|=-"],
            "F": ["|=", "\u0192", "|#", "ph", "/=", "v"],
            "G": ["6", "&", "(_+", "9", "C-", "gee", "(?,", "[,", "{,", "<-", "(."],
            "H": ["#", "/-/", "\\-\\", "[-]", "]-[", ")-(", "(-)", ":-:", "|~|", "|-|", "]~[", "}{", "!-!", "1-1",
                  "\\-/", "I+I", "?"],
            "I": ["1", "|", "][", "!", "eye", "3y3"],
            "J": [",_|", "_|", "._|", "._]", "_]", ",_]", "]"],
            "K": [">|", "|<", "1<", "|c", "|(7<"],
            "L": ["1", "2", "\u00a3", "7", "|_", "|"],
            "M": ["/\\/\\", "/V\\", "[V]", "|\\/|", "^^", "<\\/>", "{V}", "(v)", "(V)", "|\\|\\", "]\\/[", "nn", "11"],
            "N": ["^/", "|\\|", "/\\/", "[\\]", "<\\>", "{\\}", "/V", "^", "\u0e17", "\u0418"],
            "O": ["0", "()", "oh", "[]", "p", "<>", "\u00d8"],
            "P": ["|*", "|o", "|\u00ba", "?", "|^", "|>", "|\"", "9", "[]D", "|\u00b0", "|7"],
            "Q": ["(_,)", "()_", "2", "0_", "<|", "&", "9", "\u00b6", "\u204b", "\u2117"],
            "R": ["I2", "9", "|`", "|~", "|?", "/2", "|^", "lz", "7", "2", "12", "\u00ae", "[z", "\u042f", ".-", "|2",
                  "|-", "3"],
            "S": ["5", "$", "z", "\u00a7", "ehs", "es", "2"],
            "T": ["7", "+", "-|-", "']['", "\u2020", "\u00ab|\u00bb", "~|~"],
            "U": ["(_)", "|_|", "v", "L|", "\u0e1a"],
            "V": ["\\/", "|/", "\\|"],
            "W": ["\\/\\/", "vv", "\\N", "'//", "\\\\'", "\\^/", "\\/\\/", "(n)", "\\V/", "\\X/", "\\|/", "\\_|_/",
                  "\\_:_/", "uu", "2u", "\\\\//\\\\//", "\u0e1e", "\u20a9"],
            "X": ["><", "}{", "ecks", "\u00d7", "?", "}{", ")(", "]["],
            "Y": ["j", "`/", "\\|/", "\u00a5", "\\//"],
            "Z": ["2", "7_", "-/_", "%", ">_", "s", "~/_", "-\\_", "-|_"]
        }

    def randomly_replace_letter(self, letter):
        """
        Randomly replace a letter with its leet counterpart

        Characters without an entry in the mapping (digits, punctuation, whitespace, ...)
        are always returned unchanged.

        :param letter: a single character
        :return: with probability `replacement_proba` one of the leet replacements of the
            letter (drawn uniformly from its list), otherwise the letter itself
        """
        # the probability draw always happens first, so that the sequence of random
        # numbers consumed for mapped letters stays the same
        if self.random_generator.random() >= self.replacement_proba:
            return letter

        # .get instead of [...]: a character without leet counterpart must not raise
        replacements = self.leet_speak_mapping.get(letter.upper())
        if not replacements:
            return letter
        return self.random_generator.choice(replacements)

    def generate(self, X, y, **params) -> Tuple:
        """
        Applies the random leet speak replacement to every character of every word.

        :param X: an iterable of words (strings)
        :param y: returned as is
        :param params: unused
        :return: a tuple (Xt, y) where Xt is the list of transformed words
        """
        Xt = [
            ''.join([self.randomly_replace_letter(letter) for letter in word])
            for word in X
        ]

        return Xt, y
17 changes: 15 additions & 2 deletions docs/about.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,21 @@ The original idea came during internal discussions with Patricia Kelbert, Adam T
A big thanks to Daniel Seifert for taking the time to investigate and automate all the many things that make our lives as package contributors easier.

## Citing Badgers

Work In Progress

If you use [badgers](https://github.com/Fraunhofer-IESE/badgers) in scientific publications, you can cite the following paper:

Julien Siebert, Daniel Seifert, Patricia Kelbert, Michael Kläs, Adam Trendowicz (2023). Badgers: generating data quality deficits with Python. https://arxiv.org/abs/2307.04468

```
@misc{siebert2023badgers,
title={Badgers: generating data quality deficits with Python},
author={Julien Siebert and Daniel Seifert and Patricia Kelbert and Michael Kläs and Adam Trendowicz},
year={2023},
eprint={2307.04468},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
```

## Contributing to Badgers

Expand Down
149 changes: 149 additions & 0 deletions docs/tutorials/Typos-Text.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c695b94a-1032-46a3-9a97-421f1a4a43bd",
"metadata": {},
"source": [
"# Adding typos to text"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "d940b272-c15a-4b3b-b068-bc7308599534",
"metadata": {},
"outputs": [],
"source": [
"from badgers.generators.text.typos import SwapLettersGenerator, LeetSpeakGenerator"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8d8acaee-95ca-4ac1-910d-6b79256564a7",
"metadata": {},
"outputs": [],
"source": [
"X = \"the quick brown fox jumps over the lazy dog\".split(' ')"
]
},
{
"cell_type": "markdown",
"id": "4da6eb3c-cd91-41ad-9db8-6a891bad6072",
"metadata": {},
"source": [
"## Swapping letters randomly"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "196be517-21ee-4f50-8a3c-57b454fcc470",
"metadata": {},
"outputs": [],
"source": [
"swap_letters = SwapLettersGenerator(swap_proba=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bdbb18f3-29fd-48e0-8450-8de9266dd75a",
"metadata": {},
"outputs": [],
"source": [
"Xt, _ = swap_letters.generate(X.copy(), None)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4a2c09cb-9a50-474e-be2f-a9a8a9362748",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']\n",
"['the', 'qucik', 'borwn', 'fox', 'jmups', 'oevr', 'the', 'lzay', 'dog']\n"
]
}
],
"source": [
"print(X)\n",
"print(Xt)"
]
},
{
"cell_type": "markdown",
"id": "d1c389f2-5ce5-4fe6-89d0-f8002949113e",
"metadata": {},
"source": [
"## Leet Speak"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "43e634b8-2657-4e28-ac59-fa190298937d",
"metadata": {},
"outputs": [],
"source": [
"leet_speak = LeetSpeakGenerator(replacement_proba=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "0b6f7e67-e68a-446c-a8e7-25823895ea0f",
"metadata": {},
"outputs": [],
"source": [
"Xt, _ = leet_speak.generate(X.copy(), None)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "04d104f6-da91-4e50-981a-72b703903a6a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']\n",
"['th3', 'quick', 'br0w^', 'fox', 'ju|\\\\|\\\\ps', 'over', 'the', 'lazy', 'dog']\n"
]
}
],
"source": [
"print(X)\n",
"print(Xt)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 2 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ nav:
- Generating Missing Data: tutorials/Missingness-Tabular-Data.ipynb
- Generating Noise: tutorials/Noise-Tabular-Data.ipynb
- Generating Outliers: tutorials/Outliers-Tabular-Data.ipynb
- Text Data:
- Generating Typos: tutorials/Typos-Text.ipynb
- Pipelines:
- Chaining Generators: tutorials/Pipeline-Tabular-Data.ipynb
- API: reference/
Expand Down
11 changes: 6 additions & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "badgers"
version = "0.0.2"
version = "0.0.3"
keywords = ["data quality", "bad data", "data science"]
authors = [
{ name = "Julien Siebert", email = "[email protected]" },
Expand All @@ -15,17 +15,18 @@ maintainers = [
description = "Badgers: bad data generators"
readme = "README.md"
requires-python = ">=3.8"
license = {file = "LICENSE"}
license = { file = "LICENSE" }
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
"Development Status :: 1 - Planning",
]
dependencies = [
"numpy",
"pandas",
"scikit-learn",
"numpy",
"pandas",
"scikit-learn",
"networkx"
]
[project.urls]
"Homepage" = "https://github.com/Fraunhofer-IESE/badgers"
Expand Down
Empty file.
Loading

0 comments on commit 83bb5e0

Please sign in to comment.