From 03b31e2e9e459c22aef533f513613ff606e6c2a0 Mon Sep 17 00:00:00 2001 From: Marco Di Giovanni Date: Thu, 29 Jul 2021 16:51:48 +0200 Subject: [PATCH 1/6] Added underscore_trick --- transformations/underscore_trick/README.md | 20 ++++++++ transformations/underscore_trick/__init__.py | 1 + transformations/underscore_trick/test.json | 50 +++++++++++++++++++ .../underscore_trick/transformation.py | 42 ++++++++++++++++ 4 files changed, 113 insertions(+) create mode 100644 transformations/underscore_trick/README.md create mode 100644 transformations/underscore_trick/__init__.py create mode 100644 transformations/underscore_trick/test.json create mode 100644 transformations/underscore_trick/transformation.py diff --git a/transformations/underscore_trick/README.md b/transformations/underscore_trick/README.md new file mode 100644 index 000000000..af0561df6 --- /dev/null +++ b/transformations/underscore_trick/README.md @@ -0,0 +1,20 @@ +# Underscore Trick +This perturbation adds noise to all types of text sources (sentence, paragraph, etc.). + +Author name: Marco Di Giovanni +Author email: marco.digiovanni@polimi.it +Author Affiliation: Politecnico di Milano and University of Bologna + +## What type of a transformation is this? +This transformation acts like a perturbation to test robustness. +It replaces some random spaces with underscores. +This transformation mimics particular behaviours: +- names of folders, files, classes, functions (e.g., underscore_trick) +- trick word counters when there is a limited maximum number of words to insert in an online questionary/quiz. +Generated transformations display high similarity to the source sentences. + +## What tasks does it intend to benefit? +This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. Especially on tasks related to understanding/generating scripts. + +## What are the limitations of this transformation? 
+The transformation's outputs are extremely simple to be used for data augmentation. diff --git a/transformations/underscore_trick/__init__.py b/transformations/underscore_trick/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/underscore_trick/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/underscore_trick/test.json b/transformations/underscore_trick/test.json new file mode 100644 index 000000000..29640c164 --- /dev/null +++ b/transformations/underscore_trick/test.json @@ -0,0 +1,50 @@ +{ + "type": "underscore_trick", + "test_cases": [ + { + "class": "UnderscoreTrick", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [{ + "sentence": "Andrew finally_returned the French book to Chris that I_bought last week" + }] + }, + { + "class": "UnderscoreTrick", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [{ + "sentence": "Sentences with_gapping, such as Paul likes coffee and Mary_tea, lack an_overt predicate to indicate the relation between_two or more arguments." 
+ }] + }, + { + "class": "UnderscoreTrick", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [{ + "sentence": "Alice in_Wonderland is a 2010 American live-action/animated dark fantasy_adventure film" + }] + }, + { + "class": "UnderscoreTrick", + "inputs": { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + }, + "outputs": [{ + "sentence": "Ujjal Dev_Dosanjh served as 33rd Premier of British Columbia_from 2000 to_2001" + }] + }, + { + "class": "UnderscoreTrick", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [{ + "sentence": "Neuroplasticity is_a continuous processing allowing short-term, medium-term, and long-term_remodeling of the_neuronosynaptic organization." + }] + } + ] +} diff --git a/transformations/underscore_trick/transformation.py b/transformations/underscore_trick/transformation.py new file mode 100644 index 000000000..4263dac83 --- /dev/null +++ b/transformations/underscore_trick/transformation.py @@ -0,0 +1,42 @@ +import itertools +import random + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType + + +def add_underscore(text, prob=0.05, seed=42, max_outputs=1): + random.seed(seed) + + perturbed_texts = [] + for _ in itertools.repeat(None, max_outputs): + perturbed_text = "".join( + [ + letter if letter != " " or random.random() > prob else "_" + for letter in text + ] + ) + perturbed_texts.append(perturbed_text) + return perturbed_texts + + +class UnderscoreTrick(SentenceOperation): + tasks = [ + TaskType.TEXT_CLASSIFICATION, + TaskType.TEXT_TO_TEXT_GENERATION, + TaskType.TEXT_TAGGING, + ] + languages = ["All"] + + def __init__(self, seed=42, max_outputs=1, prob=0.05): + super().__init__(seed, max_outputs=max_outputs) + self.prob = prob 
+ + def generate(self, sentence: str): + perturbed_texts = add_underscore( + text=sentence, + prob=self.prob, + seed=self.seed, + max_outputs=self.max_outputs, + ) + return perturbed_texts From 4cdd971def388dfea87340f0090d37051ea2c4cc Mon Sep 17 00:00:00 2001 From: Marco Di Giovanni Date: Fri, 30 Jul 2021 09:31:32 +0200 Subject: [PATCH 2/6] Added character_duplication --- .../character_duplication/README.md | 21 ++++++++ .../character_duplication/__init__.py | 1 + .../character_duplication/test.json | 50 +++++++++++++++++++ .../character_duplication/transformation.py | 44 ++++++++++++++++ 4 files changed, 116 insertions(+) create mode 100644 transformations/character_duplication/README.md create mode 100644 transformations/character_duplication/__init__.py create mode 100644 transformations/character_duplication/test.json create mode 100644 transformations/character_duplication/transformation.py diff --git a/transformations/character_duplication/README.md b/transformations/character_duplication/README.md new file mode 100644 index 000000000..d01159082 --- /dev/null +++ b/transformations/character_duplication/README.md @@ -0,0 +1,21 @@ +# Character Duplication +This perturbation adds noise to all types of text sources (sentence, paragraph, etc.) proportional to noise erupting from keyboard typos making common spelling errors. + +Author name: Marco Di Giovanni +Author email: marco.digiovanni@polimi.it +Author Affiliation: Politecnico di Milano and University of Bologna + + + +## What type of a transformation is this? +This transformation acts like a perturbation to test robustness. +Few letters picked at random are duplicated. +Generated transformations display high similarity to the source sentences. + +## What tasks does it intend to benefit? +- This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. +- The generated texts mimic typing mistakes. 
+ +## What are the limitations of this transformation? +- This transformation is not capable of generating linguistically diverse text. +- This transformation will mainly affect the perfornamce of token/word-level models, while character-level models should be much robust. diff --git a/transformations/character_duplication/__init__.py b/transformations/character_duplication/__init__.py new file mode 100644 index 000000000..930cdce0b --- /dev/null +++ b/transformations/character_duplication/__init__.py @@ -0,0 +1 @@ +from .transformation import * diff --git a/transformations/character_duplication/test.json b/transformations/character_duplication/test.json new file mode 100644 index 000000000..c0f2d03d1 --- /dev/null +++ b/transformations/character_duplication/test.json @@ -0,0 +1,50 @@ +{ + "type": "character_duplication", + "test_cases": [ + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Andrew finally returned the French book to Chris that I bought last week" + }, + "outputs": [{ + "sentence": "Anndrew ffinnallly returrned thee French book too Chhris that I bought last week" + }] + }, + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." + }, + "outputs": [{ + "sentence": "Seentencees witth gappiing, succhh as Paul likess cooffee and Mary tea, lackk an overt predicate ttoo indiicate tthe relation between two orr moree arrguuments." 
+ }] + }, + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" + }, + "outputs": [{ + "sentence": "Allice inn WWondderland is a 200110 American livve-aaction/animated dark fanntasy adventure film" + }] + }, + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" + }, + "outputs": [{ + "sentence": "Ujjjal Deev Dossanjh seerved ass 33rd Premier oof BBritish Columbia from 20000 to 2001" + }] + }, + { + "class": "CharacterDuplication", + "inputs": { + "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." + }, + "outputs": [{ + "sentence": "Neeuroplaastticiity is aa continnuuous processingg alllowing short-term, mediium-term, and long-terrmm remoodelingg of the neuronosynaptic orrganizzatiionn." + }] + } + ] +} diff --git a/transformations/character_duplication/transformation.py b/transformations/character_duplication/transformation.py new file mode 100644 index 000000000..37334e3a1 --- /dev/null +++ b/transformations/character_duplication/transformation.py @@ -0,0 +1,44 @@ +import itertools +import random + +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType + + +def duplicate(text, prob=0.1, seed=42, max_outputs=1): + random.seed(seed) + + original_text = list(text) + perturbed_texts = [] + for _ in itertools.repeat(None, max_outputs): + perturbed_text = [ + [letter] if random.random() > prob else [letter, letter] + for letter in original_text + ] + perturbed_text = [ + letter for sublist in perturbed_text for letter in sublist + ] + perturbed_texts.append("".join(perturbed_text)) + return perturbed_texts + + +class CharacterDuplication(SentenceOperation): + tasks = [ + TaskType.TEXT_CLASSIFICATION, + 
TaskType.TEXT_TO_TEXT_GENERATION, + TaskType.TEXT_TAGGING, + ] + languages = ["All"] + + def __init__(self, seed=42, max_outputs=1, prob=0.1): + super().__init__(seed, max_outputs=max_outputs) + self.prob = prob + + def generate(self, sentence: str): + perturbed_texts = duplicate( + text=sentence, + prob=self.prob, + seed=self.seed, + max_outputs=self.max_outputs, + ) + return perturbed_texts From 92da719612c3be2a31b3e26d5128290d5b98fe90 Mon Sep 17 00:00:00 2001 From: marco-digio Date: Wed, 4 Aug 2021 10:29:15 +0200 Subject: [PATCH 3/6] Removed wrong files (different branch) --- transformations/underscore_trick/README.md | 20 -------- transformations/underscore_trick/__init__.py | 1 - transformations/underscore_trick/test.json | 50 ------------------- .../underscore_trick/transformation.py | 42 ---------------- 4 files changed, 113 deletions(-) delete mode 100644 transformations/underscore_trick/README.md delete mode 100644 transformations/underscore_trick/__init__.py delete mode 100644 transformations/underscore_trick/test.json delete mode 100644 transformations/underscore_trick/transformation.py diff --git a/transformations/underscore_trick/README.md b/transformations/underscore_trick/README.md deleted file mode 100644 index af0561df6..000000000 --- a/transformations/underscore_trick/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Underscore Trick -This perturbation adds noise to all types of text sources (sentence, paragraph, etc.). - -Author name: Marco Di Giovanni -Author email: marco.digiovanni@polimi.it -Author Affiliation: Politecnico di Milano and University of Bologna - -## What type of a transformation is this? -This transformation acts like a perturbation to test robustness. -It replaces some random spaces with underscores. -This transformation mimics particular behaviours: -- names of folders, files, classes, functions (e.g., underscore_trick) -- trick word counters when there is a limited maximum number of words to insert in an online questionary/quiz. 
-Generated transformations display high similarity to the source sentences. - -## What tasks does it intend to benefit? -This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. Especially on tasks related to understanding/generating scripts. - -## What are the limitations of this transformation? -The transformation's outputs are extremely simple to be used for data augmentation. diff --git a/transformations/underscore_trick/__init__.py b/transformations/underscore_trick/__init__.py deleted file mode 100644 index 930cdce0b..000000000 --- a/transformations/underscore_trick/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .transformation import * diff --git a/transformations/underscore_trick/test.json b/transformations/underscore_trick/test.json deleted file mode 100644 index 29640c164..000000000 --- a/transformations/underscore_trick/test.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "type": "underscore_trick", - "test_cases": [ - { - "class": "UnderscoreTrick", - "inputs": { - "sentence": "Andrew finally returned the French book to Chris that I bought last week" - }, - "outputs": [{ - "sentence": "Andrew finally_returned the French book to Chris that I_bought last week" - }] - }, - { - "class": "UnderscoreTrick", - "inputs": { - "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments." - }, - "outputs": [{ - "sentence": "Sentences with_gapping, such as Paul likes coffee and Mary_tea, lack an_overt predicate to indicate the relation between_two or more arguments." 
- }] - }, - { - "class": "UnderscoreTrick", - "inputs": { - "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" - }, - "outputs": [{ - "sentence": "Alice in_Wonderland is a 2010 American live-action/animated dark fantasy_adventure film" - }] - }, - { - "class": "UnderscoreTrick", - "inputs": { - "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001" - }, - "outputs": [{ - "sentence": "Ujjal Dev_Dosanjh served as 33rd Premier of British Columbia_from 2000 to_2001" - }] - }, - { - "class": "UnderscoreTrick", - "inputs": { - "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization." - }, - "outputs": [{ - "sentence": "Neuroplasticity is_a continuous processing allowing short-term, medium-term, and long-term_remodeling of the_neuronosynaptic organization." - }] - } - ] -} diff --git a/transformations/underscore_trick/transformation.py b/transformations/underscore_trick/transformation.py deleted file mode 100644 index 4263dac83..000000000 --- a/transformations/underscore_trick/transformation.py +++ /dev/null @@ -1,42 +0,0 @@ -import itertools -import random - -from interfaces.SentenceOperation import SentenceOperation -from tasks.TaskTypes import TaskType - - -def add_underscore(text, prob=0.05, seed=42, max_outputs=1): - random.seed(seed) - - perturbed_texts = [] - for _ in itertools.repeat(None, max_outputs): - perturbed_text = "".join( - [ - letter if letter != " " or random.random() > prob else "_" - for letter in text - ] - ) - perturbed_texts.append(perturbed_text) - return perturbed_texts - - -class UnderscoreTrick(SentenceOperation): - tasks = [ - TaskType.TEXT_CLASSIFICATION, - TaskType.TEXT_TO_TEXT_GENERATION, - TaskType.TEXT_TAGGING, - ] - languages = ["All"] - - def __init__(self, seed=42, max_outputs=1, prob=0.05): - super().__init__(seed, max_outputs=max_outputs) - self.prob = 
prob - - def generate(self, sentence: str): - perturbed_texts = add_underscore( - text=sentence, - prob=self.prob, - seed=self.seed, - max_outputs=self.max_outputs, - ) - return perturbed_texts From 97866a7f1ea53822117a5a7c11957327d29c9802 Mon Sep 17 00:00:00 2001 From: marco-digio Date: Thu, 9 Sep 2021 16:49:26 +0200 Subject: [PATCH 4/6] Fix typo --- transformations/character_duplication/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformations/character_duplication/README.md b/transformations/character_duplication/README.md index d01159082..01e5bb9d2 100644 --- a/transformations/character_duplication/README.md +++ b/transformations/character_duplication/README.md @@ -18,4 +18,4 @@ Generated transformations display high similarity to the source sentences. ## What are the limitations of this transformation? - This transformation is not capable of generating linguistically diverse text. -- This transformation will mainly affect the perfornamce of token/word-level models, while character-level models should be much robust. +- This transformation will mainly affect the performance of token/word-level models, while character-level models should be much more robust. 
From 22a16e777e74d6d348421621caaac02fb7574add Mon Sep 17 00:00:00 2001 From: marco-digio Date: Thu, 9 Sep 2021 16:50:23 +0200 Subject: [PATCH 5/6] Add keywords --- transformations/character_duplication/transformation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/transformations/character_duplication/transformation.py b/transformations/character_duplication/transformation.py index 37334e3a1..2de353572 100644 --- a/transformations/character_duplication/transformation.py +++ b/transformations/character_duplication/transformation.py @@ -29,6 +29,15 @@ class CharacterDuplication(SentenceOperation): TaskType.TEXT_TAGGING, ] languages = ["All"] + keywords = [ + "morphological", + "noise", + "rule-based", + "highly-meaning-preserving", + "high-precision", + "high-coverage", + "high-generations", + ] def __init__(self, seed=42, max_outputs=1, prob=0.1): super().__init__(seed, max_outputs=max_outputs) From eb09bbcab3d2d47742ef47e353ec315e3ab05627 Mon Sep 17 00:00:00 2001 From: marco-digio Date: Wed, 29 Sep 2021 11:02:07 +0200 Subject: [PATCH 6/6] Fix tasks, add doc string and remove digits from being duplicated --- transformations/character_duplication/test.json | 4 ++-- .../character_duplication/transformation.py | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/transformations/character_duplication/test.json b/transformations/character_duplication/test.json index c0f2d03d1..613314fe2 100644 --- a/transformations/character_duplication/test.json +++ b/transformations/character_duplication/test.json @@ -25,7 +25,7 @@ "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film" }, "outputs": [{ - "sentence": "Allice inn WWondderland is a 200110 American livve-aaction/animated dark fanntasy adventure film" + "sentence": "Allice inn WWondderland is a 2010 AAmmerican live-acctioon/animated dark fantasyy adventure film" }] }, { @@ -34,7 +34,7 @@ "sentence": "Ujjal Dev Dosanjh served as 33rd 
Premier of British Columbia from 2000 to 2001" }, "outputs": [{ - "sentence": "Ujjjal Deev Dossanjh seerved ass 33rd Premier oof BBritish Columbia from 20000 to 2001" + "sentence": "Ujjjal Deev Dossanjh seerved ass 33rd Premier of Briitish Columbia from 2000 to 2001" }] }, { diff --git a/transformations/character_duplication/transformation.py b/transformations/character_duplication/transformation.py index 2de353572..9678e4206 100644 --- a/transformations/character_duplication/transformation.py +++ b/transformations/character_duplication/transformation.py @@ -1,4 +1,3 @@ -import itertools import random from interfaces.SentenceOperation import SentenceOperation @@ -6,13 +5,18 @@ def duplicate(text, prob=0.1, seed=42, max_outputs=1): + """ + This function duplicates random chars (not digits) in the text string, with specified probability. It returns a list of different perturbed strings, whose length is specified by max_outputs. + """ random.seed(seed) original_text = list(text) perturbed_texts = [] - for _ in itertools.repeat(None, max_outputs): + for _ in range(max_outputs): perturbed_text = [ - [letter] if random.random() > prob else [letter, letter] + [letter] + if letter.isdigit() or random.random() > prob + else [letter, letter] for letter in original_text ] perturbed_text = [ @@ -26,7 +30,6 @@ class CharacterDuplication(SentenceOperation): tasks = [ TaskType.TEXT_CLASSIFICATION, TaskType.TEXT_TO_TEXT_GENERATION, - TaskType.TEXT_TAGGING, ] languages = ["All"] keywords = [