Skip to content

Commit

Permalink
Merge pull request #306 from makcedward/dev
Browse files Browse the repository at this point in the history
Release 1.1.11
  • Loading branch information
makcedward authored Jul 7, 2022
2 parents 487d9c8 + d44804d commit 23800cb
Show file tree
Hide file tree
Showing 55 changed files with 639 additions and 605 deletions.
6 changes: 6 additions & 0 deletions CHANGE.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
NLPAUG Change Log
================

### 1.1.11 Jul 6, 2022
* [Return list of output](https://github.com/makcedward/nlpaug/issues/302)
* [Fix download util](https://github.com/makcedward/nlpaug/issues/301)
* [Fix lambda label misalignment](https://github.com/makcedward/nlpaug/issues/295)
* [Add language pack reference link for SynonymAug](https://github.com/makcedward/nlpaug/issues/289)

### 1.1.10 Dec 23, 2021
* [KeywordAug supports Turkish](https://github.com/makcedward/nlpaug/pull/261)
* [Fix FrequencyMasking time range ](https://github.com/makcedward/nlpaug/pull/258)
Expand Down
13 changes: 7 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,16 +139,17 @@ http://paraphrase.org/#/download

If you use PitchAug, SpeedAug or VtlpAug, install the following dependencies as well
```bash
pip install librosa>=0.7.1 matplotlib
pip install "librosa>=0.9.1" matplotlib
```

## Recent Changes

### 1.1.10 Dec 23, 2021
* [KeywordAug supports Turkish](https://github.com/makcedward/nlpaug/pull/261)
* [Fix FrequencyMasking time range ](https://github.com/makcedward/nlpaug/pull/258)
* [Remove unnecessary printout](https://github.com/makcedward/nlpaug/pull/263)
* [Rollback ContextualWordEmbsForSentenceAug and AbstSummAug to use custom transformers API to reduce execution time]
### 1.1.11 Jul 6, 2022
* [Return list of output](https://github.com/makcedward/nlpaug/issues/302)
* [Fix download util](https://github.com/makcedward/nlpaug/issues/301)
* [Fix lambda label misalignment](https://github.com/makcedward/nlpaug/issues/295)
* [Add language pack reference link for SynonymAug](https://github.com/makcedward/nlpaug/issues/289)


See [changelog](https://github.com/makcedward/nlpaug/blob/master/CHANGE.md) for more details.

Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ def __getattr__(cls, name):
# built documents.
#
# The short X.Y version.
version = '1.1.11_dev'
version = '1.1.11'
# The full version, including alpha/beta/rc tags.
release = '1.1.11_dev'
release = '1.1.11'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion nlpaug/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

__all__ = ['base_augmenter']

__version__ = '1.1.11_dev'
__version__ = '1.1.11'
__description__ = 'Natural language processing augmentation library for deep neural networks.'
__url__ = 'https://github.com/makcedward/nlpaug'
__author__ = 'Edward Ma'
Expand Down
12 changes: 7 additions & 5 deletions nlpaug/augmenter/sentence/context_word_embs_sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import os
from typing import Iterable

from nlpaug.augmenter.sentence import SentenceAugmenter
import nlpaug.model.lang_models as nml
Expand Down Expand Up @@ -102,14 +103,15 @@ def insert(self, data):
if not data:
return data

if isinstance(data, list):
all_data = data
else:
if isinstance(data, str):
if data.strip() == '':
return data

all_data = [data]

elif isinstance(data, Iterable):
all_data = data
else:
all_data = [data]

if self.use_custom_api:
return self._custom_insert(all_data)
else:
Expand Down
4 changes: 3 additions & 1 deletion nlpaug/augmenter/sentence/sentence_augmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None,

@classmethod
def clean(cls, data):
if isinstance(data, str):
return data.strip()
if isinstance(data, Iterable):
return [d.strip() for d in data]
return data.strip()
return str(data).strip()

@classmethod
def is_duplicate(cls, dataset, data):
Expand Down
4 changes: 3 additions & 1 deletion nlpaug/augmenter/word/synonym.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ class SynonymAug(WordAugmenter):
:param str aug_src: Support 'wordnet' and 'ppdb' .
:param str model_path: Path of dictionary. Mandatory field if using PPDB as data source
:param str lang: Language of your text. Default value is 'eng'.
:param str lang: Language of your text. Default value is 'eng'. For `wordnet`, you can choose lang from the list at
http://compling.hss.ntu.edu.sg/omw/. For `ppdb`, simply download the corresponding language pack from
http://paraphrase.org/#/download.
:param float aug_p: Percentage of words that will be augmented.
:param int aug_min: Minimum number of words that will be augmented.
:param int aug_max: Maximum number of word will be augmented. If None is passed, number of augmentation is
Expand Down
4 changes: 3 additions & 1 deletion nlpaug/augmenter/word/word_augmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ def __init__(self, action, name='Word_Aug', aug_min=1, aug_max=10, aug_p=0.3, st

@classmethod
def clean(cls, data):
if isinstance(data, str):
return data.strip()
if isinstance(data, Iterable) :
return [d.strip() if d else d for d in data]
return data.strip()
return str(data).strip()

def skip_aug(self, token_idxes, tokens):
return token_idxes
Expand Down
9 changes: 3 additions & 6 deletions nlpaug/base_augmenter.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,13 @@ def augment(self, data, n=1, num_thread=1):

# Return empty value per data type
if isinstance(data, str):
return ''
return []
elif isinstance(data, list):
return []
elif isinstance(data, np.ndarray):
return np.array([])

return None
return []

action_fx = None
clean_data = self.clean(data)
Expand Down Expand Up @@ -125,10 +125,9 @@ def augment(self, data, n=1, num_thread=1):
if len(augmented_results) >= expected_output_num:
break

# TODO: standardize output to list even though n=1 from 1.0.0
if len(augmented_results) == 0:
# If there is no augmented result, return the input itself
if n == 1:
if isinstance(data, list):
return data
# Single input with/without multiple input
else:
Expand All @@ -140,8 +139,6 @@ def augment(self, data, n=1, num_thread=1):
if isinstance(data, list):
return augmented_results
else:
if n == 1:
return augmented_results[0]
return augmented_results[:n]

# return augmented_results
Expand Down
24 changes: 8 additions & 16 deletions nlpaug/flow/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def augment(self, data, n=1, num_thread=1):
else:
if self.device == 'cpu':
augmented_results = self._parallel_augment(self._augment, data, n=n, num_thread=num_thread)

# TODO: Externalize to util for checking
elif 'cuda' in self.device:
# TODO: support multiprocessing for GPU
Expand All @@ -67,24 +68,21 @@ def augment(self, data, n=1, num_thread=1):
else:
raise ValueError('Unsupported device mode [{}]. Only support `cpu` or `cuda`'.format(self.device))

# Flatten nested list
augmented_results = [r for sub_results in augmented_results for r in sub_results if len(r) > 0]
for augmented_result in augmented_results:
if is_duplicate_fx is not None and not is_duplicate_fx(results + [data], augmented_result):
results.append(augmented_result)
results.extend(augmented_result)

if len(results) >= n:
break
if len(results) >= n:
break

# TODO: standardize output to list even though n=1
if len(results) == 0:
# If there is no augmented result, return the input itself
if n == 1:
return data
else:
return [data]
if n == 1:
return results[0]
if len(data) == 0:
return []
return [data]
return results[:n]

def _augment(self, data, n=1, num_thread=1):
Expand Down Expand Up @@ -115,16 +113,10 @@ def _augment(self, data, n=1, num_thread=1):
results.append(augmented_data)
break

# TODO: standardize output to list even though n=1
output = None
if len(results) == 0:
# If there is no augmented result, return the input itself
if n == 1:
output = data
else:
output = [data]
elif n == 1:
output = results[0]
output = [data]
else:
output = results[:n]

Expand Down
2 changes: 1 addition & 1 deletion nlpaug/model/audio/pitch.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ def __init__(self):
def manipulate(self, data, start_pos, end_pos, pitch_level, sampling_rate):
aug_data = data.copy()
aug_data[start_pos:end_pos] = librosa.effects.pitch_shift(
aug_data[start_pos:end_pos], sampling_rate, pitch_level)
y=aug_data[start_pos:end_pos], sr=sampling_rate, n_steps=pitch_level)

return aug_data
2 changes: 1 addition & 1 deletion nlpaug/model/audio/speed.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ def __init__(self):
raise ModuleNotFoundError('Missed librosa library. Install it via `pip install librosa`')

def manipulate(self, data, start_pos, end_pos, speed):
aug_data = librosa.effects.time_stretch(data[start_pos:end_pos], speed)
aug_data = librosa.effects.time_stretch(y=data[start_pos:end_pos], rate=speed)
return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0)
2 changes: 1 addition & 1 deletion nlpaug/model/lang_models/lambada.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def _generate(self, texts, n):
results = []
# Encode
for label in texts:
input_text = 'label_{} {}'.format(label, self.sep_token)
input_text = '{} {}'.format(label, self.sep_token)
input_ids = self.gen_tokenizer.encode(input_text, add_special_tokens=False, return_tensors='pt')
input_ids = input_ids.to(self.device)

Expand Down
16 changes: 14 additions & 2 deletions nlpaug/model/word_dict/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,13 @@ def __init__(self, lang, is_synonym=True):
self.model = self.read()

def read(self):
return wordnet
try:
wordnet.synsets('testing')
return wordnet
except LookupError:
nltk.download('wordnet')
nltk.download('omw-1.4')
return wordnet

def predict(self, word, pos=None):
results = []
Expand All @@ -48,4 +54,10 @@ def predict(self, word, pos=None):

@classmethod
def pos_tag(cls, tokens):
return nltk.pos_tag(tokens)
try:
results = nltk.pos_tag(tokens)
except LookupError:
nltk.download('averaged_perceptron_tagger')
results = nltk.pos_tag(tokens)

return results
3 changes: 2 additions & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ pyinstrument
transformers
torch
simpletransformers
gensim>=4.1.2
gensim>=4.1.2
librosa>=0.9
2 changes: 1 addition & 1 deletion scripts/lambada/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def prepare_mlm_data(labels, texts, output_file_path, sep_token):
with open(os.path.join(output_file_path, 'mlm_data.txt'), 'w') as f:
for label, text in zip(labels, texts):
f.write(' '.join([label, sep_token, text]) + '\n')
f.write(' '.join([str(label), sep_token, text]) + '\n')

def main(args):
data = pd.read_csv(args.data_path)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

setup(
name="nlpaug",
version="1.1.11_dev",
version="1.1.11",
author="Edward Ma",
author_email="[email protected]",
url="https://github.com/makcedward/nlpaug",
Expand Down
3 changes: 2 additions & 1 deletion test/augmenter/audio/test_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,5 @@ def test_coverage_and_zone(self):

for aug in augs:
aug_data = aug.augment(self.audio)
self.assertTrue(len(aug_data[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
aug_audio = aug_data[0]
self.assertTrue(len(aug_audio[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
17 changes: 11 additions & 6 deletions test/augmenter/audio/test_crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,24 @@ def setUpClass(cls):
def test_empty_input(self):
audio = np.array([])
aug = naa.CropAug(sampling_rate=self.sampling_rate)
augmented_audio = aug.augment(audio)
augmented_data = aug.augment(audio)

self.assertTrue(np.array_equal(audio, augmented_audio))
self.assertTrue(np.array_equal(audio, augmented_data))

def test_substitute(self):
aug = naa.CropAug(sampling_rate=self.sampling_rate)
augmented_audio = aug.augment(self.audio)
augmented_data = aug.augment(self.audio)
augmented_audio = augmented_data[0]

self.assertNotEqual(len(self.audio), len(augmented_audio))

def test_coverage(self):
aug = naa.CropAug(sampling_rate=self.sampling_rate, coverage=0.1)
augmented_data = aug.augment(self.audio)
augmented_audio = augmented_data[0]

audio_size = len(self.audio)
augmented_size = len(augmented_data)
augmented_size = len(augmented_audio)
expected_crop_size = len(self.audio) * (aug.zone[1] - aug.zone[0]) * 0.1

self.assertTrue(-1 <= audio_size - augmented_size - expected_crop_size <= 1)
Expand All @@ -47,8 +50,10 @@ def test_duration(self):

for _ in range(10):
aug = naa.CropAug(sampling_rate=self.sampling_rate, duration=duration, stateless=False)
aug_data = aug.augment(self.audio)
aug_size = len(aug_data)
augmented_data = aug.augment(self.audio)
augmented_audio = augmented_data[0]

aug_size = len(augmented_audio)
expected_crop_size = self.sampling_rate * duration

self.assertGreater(audio_size, aug_size)
Expand Down
7 changes: 4 additions & 3 deletions test/augmenter/audio/test_inversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ def setUpClass(cls):
def test_empty_input(self):
audio = np.array([])
aug = naa.PolarityInverseAug()
augmented_audio = aug.augment(audio)
augmented_data = aug.augment(audio)

self.assertTrue(np.array_equal(audio, augmented_audio))
self.assertTrue(np.array_equal(audio, augmented_data))

def test_inverse(self):
aug = naa.PolarityInverseAug()
augmented_audio = aug.augment(self.audio)
augmented_data = aug.augment(self.audio)
augmented_audio = augmented_data[0]

self.assertFalse(np.array_equal(self.audio, augmented_audio))
self.assertEqual(len(self.audio), len(augmented_audio))
7 changes: 4 additions & 3 deletions test/augmenter/audio/test_loudness.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,14 @@ def setUpClass(cls):
def test_empty_input(self):
audio = np.array([])
aug = naa.LoudnessAug()
augmented_audio = aug.augment(audio)
augmented_data = aug.augment(audio)

self.assertTrue(np.array_equal(audio, augmented_audio))
self.assertTrue(np.array_equal(audio, augmented_data))

def test_substitute(self):
aug = naa.LoudnessAug()
augmented_audio = aug.augment(self.audio)
augmented_data = aug.augment(self.audio)
augmented_audio = augmented_data[0]

self.assertFalse(np.array_equal(self.audio, augmented_audio))
self.assertEqual(len(self.audio), len(augmented_audio))
Expand Down
Loading

0 comments on commit 23800cb

Please sign in to comment.