Merge pull request #306 from makcedward/dev

Release 1.1.11
makcedward · Jul 7, 2022 · 23800cb · 23800cb
2 parents 487d9c8 + d44804d
commit 23800cb
Show file tree

Hide file tree

Showing 55 changed files with 639 additions and 605 deletions.
diff --git a/CHANGE.md b/CHANGE.md
@@ -1,6 +1,12 @@
 NLPAUG Change Log
 ================
 
+### 1.1.11 Jul 6, 2022
+*   [Return list of output](https://github.com/makcedward/nlpaug/issues/302)
+*   [Fix download util](https://github.com/makcedward/nlpaug/issues/301)
+*   [Fix lambda label misalignment](https://github.com/makcedward/nlpaug/issues/295)
+*   [Add language pack reference link for SynonymAug](https://github.com/makcedward/nlpaug/issues/289)
+
 ### 1.1.10 Dec 23, 2021
 *   [KeywordAug supports Turkish](https://github.com/makcedward/nlpaug/pull/261)
 *   [Fix FrequencyMasking time range ](https://github.com/makcedward/nlpaug/pull/258)

diff --git a/README.md b/README.md
@@ -139,16 +139,17 @@ http://paraphrase.org/#/download
 
 If you use PitchAug, SpeedAug and VtlpAug, installing the following dependencies as well
 ```bash
-pip install librosa>=0.7.1 matplotlib
+pip install "librosa>=0.9.1" matplotlib
 ```
 
 ## Recent Changes
 
-### 1.1.10 Dec 23, 2021
-*   [KeywordAug supports Turkish](https://github.com/makcedward/nlpaug/pull/261)
-*   [Fix FrequencyMasking time range ](https://github.com/makcedward/nlpaug/pull/258)
-*   [Remove unnecessary printout](https://github.com/makcedward/nlpaug/pull/263)
-*   [Rollback ContextualWordEmbsForSentenceAug and AbstSummAug to use custom transformers API to reduce execution time]
+### 1.1.11 Jul 6, 2022
+*   [Return list of output](https://github.com/makcedward/nlpaug/issues/302)
+*   [Fix download util](https://github.com/makcedward/nlpaug/issues/301)
+*   [Fix lambda label misalignment](https://github.com/makcedward/nlpaug/issues/295)
+*   [Add language pack reference link for SynonymAug](https://github.com/makcedward/nlpaug/issues/289)
+
 
 See [changelog](https://github.com/makcedward/nlpaug/blob/master/CHANGE.md) for more details.
 

diff --git a/docs/conf.py b/docs/conf.py
@@ -74,9 +74,9 @@ def __getattr__(cls, name):
 # built documents.
 #
 # The short X.Y version.
-version = '1.1.11_dev'
+version = '1.1.11'
 # The full version, including alpha/beta/rc tags.
-release = '1.1.11_dev'
+release = '1.1.11'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/nlpaug/__init__.py b/nlpaug/__init__.py
@@ -3,7 +3,7 @@
 
 __all__ = ['base_augmenter']
 
-__version__ = '1.1.11_dev'
+__version__ = '1.1.11'
 __description__ = 'Natural language processing augmentation library for deep neural networks.'
 __url__ = 'https://github.com/makcedward/nlpaug'
 __author__ = 'Edward Ma'

diff --git a/nlpaug/augmenter/sentence/context_word_embs_sentence.py b/nlpaug/augmenter/sentence/context_word_embs_sentence.py
@@ -3,6 +3,7 @@
 """
 
 import os
+from typing import Iterable
 
 from nlpaug.augmenter.sentence import SentenceAugmenter
 import nlpaug.model.lang_models as nml
@@ -102,14 +103,15 @@ def insert(self, data):
         if not data:
             return data
 
-        if isinstance(data, list):
-            all_data = data
-        else:
+        if isinstance(data, str):
             if data.strip() == '':
                 return data
-
             all_data = [data]
-
+        elif isinstance(data, Iterable):
+            all_data = data
+        else:
+            all_data = [data]
+
         if self.use_custom_api:
             return self._custom_insert(all_data)
         else:

diff --git a/nlpaug/augmenter/sentence/sentence_augmenter.py b/nlpaug/augmenter/sentence/sentence_augmenter.py
@@ -18,9 +18,11 @@ def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None,
 
     @classmethod
     def clean(cls, data):
+        if isinstance(data, str):
+            return data.strip()
         if isinstance(data, Iterable):
             return [d.strip() for d in data]
-        return data.strip()
+        return str(data).strip()
 
     @classmethod
     def is_duplicate(cls, dataset, data):

diff --git a/nlpaug/augmenter/word/synonym.py b/nlpaug/augmenter/word/synonym.py
@@ -32,7 +32,9 @@ class SynonymAug(WordAugmenter):
 
     :param str aug_src: Support 'wordnet' and 'ppdb' .
     :param str model_path: Path of dictionary. Mandatory field if using PPDB as data source
-    :param str lang: Language of your text. Default value is 'eng'.
+    :param str lang: Language of your text. Default value is 'eng'. For `wordnet`, you can choose lang from this list
+        http://compling.hss.ntu.edu.sg/omw/. For `ppdb`, simply download the corresponding language pack from
+        http://paraphrase.org/#/download.
     :param float aug_p: Percentage of word will be augmented.
     :param int aug_min: Minimum number of word will be augmented.
     :param int aug_max: Maximum number of word will be augmented. If None is passed, number of augmentation is

diff --git a/nlpaug/augmenter/word/word_augmenter.py b/nlpaug/augmenter/word/word_augmenter.py
@@ -23,9 +23,11 @@ def __init__(self, action, name='Word_Aug', aug_min=1, aug_max=10, aug_p=0.3, st
 
     @classmethod
     def clean(cls, data):
+        if isinstance(data, str):
+            return data.strip()
         if isinstance(data, Iterable) :
             return [d.strip() if d else d for d in data]
-        return data.strip()
+        return str(data).strip()
 
     def skip_aug(self, token_idxes, tokens):
         return token_idxes

diff --git a/nlpaug/base_augmenter.py b/nlpaug/base_augmenter.py
@@ -63,13 +63,13 @@ def augment(self, data, n=1, num_thread=1):
 
                 # Return empty value per data type
                 if isinstance(data, str):
-                    return ''
+                    return []
                 elif isinstance(data, list):
                     return []
                 elif isinstance(data, np.ndarray):
                     return np.array([])
 
-                return None
+                return []
 
         action_fx = None
         clean_data = self.clean(data)
@@ -125,10 +125,9 @@ def augment(self, data, n=1, num_thread=1):
             if len(augmented_results) >= expected_output_num:
                 break
 
-         # TODO: standardize output to list even though n=1 from 1.0.0
         if len(augmented_results) == 0:
             # if not result, return itself
-            if n == 1:
+            if isinstance(data, list):
                 return data
             # Single input with/without multiple input
             else:
@@ -140,8 +139,6 @@ def augment(self, data, n=1, num_thread=1):
             if isinstance(data, list):
                 return augmented_results
             else:
-                if n == 1:
-                    return augmented_results[0]
                 return augmented_results[:n]
 
         # return augmented_results

diff --git a/nlpaug/flow/pipeline.py b/nlpaug/flow/pipeline.py
@@ -59,6 +59,7 @@ def augment(self, data, n=1, num_thread=1):
             else:
                 if self.device == 'cpu':
                     augmented_results = self._parallel_augment(self._augment, data, n=n, num_thread=num_thread)
+
                 # TODO: Externalize to util for checking
                 elif 'cuda' in self.device:
                     # TODO: support multiprocessing for GPU
@@ -67,24 +68,21 @@ def augment(self, data, n=1, num_thread=1):
                 else:
                     raise ValueError('Unsupported device mode [{}]. Only support `cpu` or `cuda`'.format(self.device))
 
+            # Flatten nested list
+            augmented_results = [r for sub_results in augmented_results for r in sub_results if len(r) > 0]
             for augmented_result in augmented_results:
                 if is_duplicate_fx is not None and not is_duplicate_fx(results + [data], augmented_result):
-                    results.append(augmented_result)
+                    results.extend(augmented_result)
 
                 if len(results) >= n:
                     break
             if len(results) >= n:
                 break
 
-        # TODO: standardize output to list even though n=1
         if len(results) == 0:
-            # if not result, return itself
-            if n == 1:
-                return data
-            else:
-                return [data]
-        if n == 1:
-            return results[0]
+            if len(data) == 0:
+                return []
+            return [data]
         return results[:n]
 
     def _augment(self, data, n=1, num_thread=1):
@@ -115,16 +113,10 @@ def _augment(self, data, n=1, num_thread=1):
                 results.append(augmented_data)
             break
 
-        # TODO: standardize output to list even though n=1
         output = None
         if len(results) == 0:
             # if not result, return itself
-            if n == 1:
-                output = data
-            else:
-                output = [data]
-        elif n == 1:
-            output = results[0]
+            output = [data]
         else:
             output = results[:n]
 

diff --git a/nlpaug/model/audio/pitch.py b/nlpaug/model/audio/pitch.py
@@ -21,6 +21,6 @@ def __init__(self):
     def manipulate(self, data, start_pos, end_pos, pitch_level, sampling_rate):
         aug_data = data.copy()
         aug_data[start_pos:end_pos] = librosa.effects.pitch_shift(
-            aug_data[start_pos:end_pos], sampling_rate, pitch_level)
+            y=aug_data[start_pos:end_pos], sr=sampling_rate, n_steps=pitch_level)
 
         return aug_data
diff --git a/nlpaug/model/audio/speed.py b/nlpaug/model/audio/speed.py
@@ -19,5 +19,5 @@ def __init__(self):
             raise ModuleNotFoundError('Missed librosa library. Install it via `pip install librosa`')
 
     def manipulate(self, data, start_pos, end_pos, speed):
-        aug_data = librosa.effects.time_stretch(data[start_pos:end_pos], speed)
+        aug_data = librosa.effects.time_stretch(y=data[start_pos:end_pos], rate=speed)
         return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0)
diff --git a/nlpaug/model/lang_models/lambada.py b/nlpaug/model/lang_models/lambada.py
@@ -64,7 +64,7 @@ def _generate(self, texts, n):
 		results = []
 		# Encode
 		for label in texts:
-			input_text = 'label_{} {}'.format(label, self.sep_token)
+			input_text = '{} {}'.format(label, self.sep_token)
 			input_ids = self.gen_tokenizer.encode(input_text, add_special_tokens=False, return_tensors='pt')
 			input_ids = input_ids.to(self.device)
 

diff --git a/nlpaug/model/word_dict/wordnet.py b/nlpaug/model/word_dict/wordnet.py
@@ -33,7 +33,13 @@ def __init__(self, lang, is_synonym=True):
         self.model = self.read()
 
     def read(self):
-        return wordnet
+        try:
+            wordnet.synsets('testing')
+            return wordnet
+        except LookupError:
+            nltk.download('wordnet')
+            nltk.download('omw-1.4')
+            return wordnet
 
     def predict(self, word, pos=None):
         results = []
@@ -48,4 +54,10 @@ def predict(self, word, pos=None):
 
     @classmethod
     def pos_tag(cls, tokens):
-        return nltk.pos_tag(tokens)
+        try:
+            results = nltk.pos_tag(tokens)
+        except LookupError:
+            nltk.download('averaged_perceptron_tagger')
+            results = nltk.pos_tag(tokens)
+
+        return results
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -5,4 +5,5 @@ pyinstrument
 transformers
 torch
 simpletransformers
-gensim>=4.1.2
+gensim>=4.1.2
+librosa>=0.9
diff --git a/scripts/lambada/data_processing.py b/scripts/lambada/data_processing.py
@@ -6,7 +6,7 @@
 def prepare_mlm_data(labels, texts, output_file_path, sep_token):
 	with open(os.path.join(output_file_path, 'mlm_data.txt'), 'w') as f:
 		for label, text in zip(labels, texts):
-			f.write(' '.join([label, sep_token, text]) + '\n')
+			f.write(' '.join([str(label), sep_token, text]) + '\n')
 
 def main(args):
 	data = pd.read_csv(args.data_path)

diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 
 setup(
     name="nlpaug",
-    version="1.1.11_dev",
+    version="1.1.11",
     author="Edward Ma",
     author_email="[email protected]",
     url="https://github.com/makcedward/nlpaug",

diff --git a/test/augmenter/audio/test_audio.py b/test/augmenter/audio/test_audio.py
@@ -51,4 +51,5 @@ def test_coverage_and_zone(self):
 
             for aug in augs:
                 aug_data = aug.augment(self.audio)
-                self.assertTrue(len(aug_data[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
+                aug_audio = aug_data[0]
+                self.assertTrue(len(aug_audio[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
diff --git a/test/augmenter/audio/test_crop.py b/test/augmenter/audio/test_crop.py
@@ -22,21 +22,24 @@ def setUpClass(cls):
     def test_empty_input(self):
         audio = np.array([])
         aug = naa.CropAug(sampling_rate=self.sampling_rate)
-        augmented_audio = aug.augment(audio)
+        augmented_data = aug.augment(audio)
 
-        self.assertTrue(np.array_equal(audio, augmented_audio))
+        self.assertTrue(np.array_equal(audio, augmented_data))
 
     def test_substitute(self):
         aug = naa.CropAug(sampling_rate=self.sampling_rate)
-        augmented_audio = aug.augment(self.audio)
+        augmented_data = aug.augment(self.audio)
+        augmented_audio = augmented_data[0]
 
         self.assertNotEqual(len(self.audio), len(augmented_audio))
 
     def test_coverage(self):
         aug = naa.CropAug(sampling_rate=self.sampling_rate, coverage=0.1)
         augmented_data = aug.augment(self.audio)
+        augmented_audio = augmented_data[0]
+
         audio_size = len(self.audio)
-        augmented_size = len(augmented_data)
+        augmented_size = len(augmented_audio)
         expected_crop_size = len(self.audio) * (aug.zone[1] - aug.zone[0]) * 0.1
 
         self.assertTrue(-1 <= audio_size - augmented_size - expected_crop_size <= 1)
@@ -47,8 +50,10 @@ def test_duration(self):
 
         for _ in range(10):
             aug = naa.CropAug(sampling_rate=self.sampling_rate, duration=duration, stateless=False)
-            aug_data = aug.augment(self.audio)
-            aug_size = len(aug_data)
+            augmented_data = aug.augment(self.audio)
+            augmented_audio = augmented_data[0]
+
+            aug_size = len(augmented_audio)
             expected_crop_size = self.sampling_rate * duration
 
             self.assertGreater(audio_size, aug_size)

diff --git a/test/augmenter/audio/test_inversion.py b/test/augmenter/audio/test_inversion.py
@@ -22,13 +22,14 @@ def setUpClass(cls):
     def test_empty_input(self):
         audio = np.array([])
         aug = naa.PolarityInverseAug()
-        augmented_audio = aug.augment(audio)
+        augmented_data = aug.augment(audio)
 
-        self.assertTrue(np.array_equal(audio, augmented_audio))
+        self.assertTrue(np.array_equal(audio, augmented_data))
 
     def test_inverse(self):
         aug = naa.PolarityInverseAug()
-        augmented_audio = aug.augment(self.audio)
+        augmented_data = aug.augment(self.audio)
+        augmented_audio = augmented_data[0]
 
         self.assertFalse(np.array_equal(self.audio, augmented_audio))
         self.assertEqual(len(self.audio), len(augmented_audio))
diff --git a/test/augmenter/audio/test_loudness.py b/test/augmenter/audio/test_loudness.py
@@ -22,13 +22,14 @@ def setUpClass(cls):
     def test_empty_input(self):
         audio = np.array([])
         aug = naa.LoudnessAug()
-        augmented_audio = aug.augment(audio)
+        augmented_data = aug.augment(audio)
 
-        self.assertTrue(np.array_equal(audio, augmented_audio))
+        self.assertTrue(np.array_equal(audio, augmented_data))
 
     def test_substitute(self):
         aug = naa.LoudnessAug()
-        augmented_audio = aug.augment(self.audio)
+        augmented_data = aug.augment(self.audio)
+        augmented_audio = augmented_data[0]
 
         self.assertFalse(np.array_equal(self.audio, augmented_audio))
         self.assertEqual(len(self.audio), len(augmented_audio))