From 03b31e2e9e459c22aef533f513613ff606e6c2a0 Mon Sep 17 00:00:00 2001
From: Marco Di Giovanni <marco@MacBook-Pro-di-Marco.local>
Date: Thu, 29 Jul 2021 16:51:48 +0200
Subject: [PATCH 1/6] Added underscore_trick

---
 transformations/underscore_trick/README.md    | 20 ++++++++
 transformations/underscore_trick/__init__.py  |  1 +
 transformations/underscore_trick/test.json    | 50 +++++++++++++++++++
 .../underscore_trick/transformation.py        | 42 ++++++++++++++++
 4 files changed, 113 insertions(+)
 create mode 100644 transformations/underscore_trick/README.md
 create mode 100644 transformations/underscore_trick/__init__.py
 create mode 100644 transformations/underscore_trick/test.json
 create mode 100644 transformations/underscore_trick/transformation.py

diff --git a/transformations/underscore_trick/README.md b/transformations/underscore_trick/README.md
new file mode 100644
index 000000000..af0561df6
--- /dev/null
+++ b/transformations/underscore_trick/README.md
@@ -0,0 +1,20 @@
+# Underscore Trick
+This perturbation adds noise to all types of text sources (sentence, paragraph, etc.).
+
+Author name: Marco Di Giovanni
+Author email: marco.digiovanni@polimi.it
+Author Affiliation: Politecnico di Milano and University of Bologna
+
+## What type of a transformation is this?
+This transformation acts like a perturbation to test robustness.
+It replaces some random spaces with underscores.
+This transformation mimics particular behaviours:
+- names of folders, files, classes, functions (e.g., underscore_trick)
+- trick word counters when there is a limited maximum number of words to insert in an online questionary/quiz.
+Generated transformations display high similarity to the source sentences.
+
+## What tasks does it intend to benefit?
+This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. Especially on tasks related to understanding/generating scripts.
+
+## What are the limitations of this transformation?
+The transformation's outputs are extremely simple to be used for data augmentation.
diff --git a/transformations/underscore_trick/__init__.py b/transformations/underscore_trick/__init__.py
new file mode 100644
index 000000000..930cdce0b
--- /dev/null
+++ b/transformations/underscore_trick/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/underscore_trick/test.json b/transformations/underscore_trick/test.json
new file mode 100644
index 000000000..29640c164
--- /dev/null
+++ b/transformations/underscore_trick/test.json
@@ -0,0 +1,50 @@
+{
+  "type": "underscore_trick",
+  "test_cases": [
+    {
+      "class": "UnderscoreTrick",
+      "inputs": {
+        "sentence": "Andrew finally returned the French book to Chris that I bought last week"
+      },
+      "outputs": [{
+        "sentence": "Andrew finally_returned the French book to Chris that I_bought last week"
+      }]
+    },
+    {
+      "class": "UnderscoreTrick",
+      "inputs": {
+        "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."
+      },
+      "outputs": [{
+        "sentence": "Sentences with_gapping, such as Paul likes coffee and Mary_tea, lack an_overt predicate to indicate the relation between_two or more arguments."
+      }]
+    },
+    {
+      "class": "UnderscoreTrick",
+      "inputs": {
+        "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
+      },
+      "outputs": [{
+        "sentence": "Alice in_Wonderland is a 2010 American live-action/animated dark fantasy_adventure film"
+      }]
+    },
+    {
+      "class": "UnderscoreTrick",
+      "inputs": {
+        "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
+      },
+      "outputs": [{
+        "sentence": "Ujjal Dev_Dosanjh served as 33rd Premier of British Columbia_from 2000 to_2001"
+      }]
+    },
+    {
+      "class": "UnderscoreTrick",
+      "inputs": {
+        "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."
+      },
+      "outputs": [{
+        "sentence": "Neuroplasticity is_a continuous processing allowing short-term, medium-term, and long-term_remodeling of the_neuronosynaptic organization."
+      }]
+    }
+  ]
+}
diff --git a/transformations/underscore_trick/transformation.py b/transformations/underscore_trick/transformation.py
new file mode 100644
index 000000000..4263dac83
--- /dev/null
+++ b/transformations/underscore_trick/transformation.py
@@ -0,0 +1,42 @@
+import itertools
+import random
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+
+def add_underscore(text, prob=0.05, seed=42, max_outputs=1):
+    random.seed(seed)
+
+    perturbed_texts = []
+    for _ in itertools.repeat(None, max_outputs):
+        perturbed_text = "".join(
+            [
+                letter if letter != " " or random.random() > prob else "_"
+                for letter in text
+            ]
+        )
+        perturbed_texts.append(perturbed_text)
+    return perturbed_texts
+
+
+class UnderscoreTrick(SentenceOperation):
+    tasks = [
+        TaskType.TEXT_CLASSIFICATION,
+        TaskType.TEXT_TO_TEXT_GENERATION,
+        TaskType.TEXT_TAGGING,
+    ]
+    languages = ["All"]
+
+    def __init__(self, seed=42, max_outputs=1, prob=0.05):
+        super().__init__(seed, max_outputs=max_outputs)
+        self.prob = prob
+
+    def generate(self, sentence: str):
+        perturbed_texts = add_underscore(
+            text=sentence,
+            prob=self.prob,
+            seed=self.seed,
+            max_outputs=self.max_outputs,
+        )
+        return perturbed_texts

From 4cdd971def388dfea87340f0090d37051ea2c4cc Mon Sep 17 00:00:00 2001
From: Marco Di Giovanni <marco@MacBook-Pro-di-Marco.local>
Date: Fri, 30 Jul 2021 09:31:32 +0200
Subject: [PATCH 2/6] Added character_duplication

---
 .../character_duplication/README.md           | 21 ++++++++
 .../character_duplication/__init__.py         |  1 +
 .../character_duplication/test.json           | 50 +++++++++++++++++++
 .../character_duplication/transformation.py   | 44 ++++++++++++++++
 4 files changed, 116 insertions(+)
 create mode 100644 transformations/character_duplication/README.md
 create mode 100644 transformations/character_duplication/__init__.py
 create mode 100644 transformations/character_duplication/test.json
 create mode 100644 transformations/character_duplication/transformation.py

diff --git a/transformations/character_duplication/README.md b/transformations/character_duplication/README.md
new file mode 100644
index 000000000..d01159082
--- /dev/null
+++ b/transformations/character_duplication/README.md
@@ -0,0 +1,21 @@
+# Character Duplication
+This perturbation adds noise to all types of text sources (sentence, paragraph, etc.), similar to the noise that arises from keyboard typos and common spelling errors.
+
+Author name: Marco Di Giovanni
+Author email: marco.digiovanni@polimi.it
+Author Affiliation: Politecnico di Milano and University of Bologna
+
+
+
+## What type of a transformation is this?
+This transformation acts like a perturbation to test robustness.
+A few characters picked at random are duplicated.
+Generated transformations display high similarity to the source sentences.
+
+## What tasks does it intend to benefit?
+- This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc.
+- The generated texts mimic typing mistakes.
+
+## What are the limitations of this transformation?
+- This transformation is not capable of generating linguistically diverse text.
+- This transformation will mainly affect the perfornamce of token/word-level models, while character-level models should be much robust.
diff --git a/transformations/character_duplication/__init__.py b/transformations/character_duplication/__init__.py
new file mode 100644
index 000000000..930cdce0b
--- /dev/null
+++ b/transformations/character_duplication/__init__.py
@@ -0,0 +1 @@
+from .transformation import *
diff --git a/transformations/character_duplication/test.json b/transformations/character_duplication/test.json
new file mode 100644
index 000000000..c0f2d03d1
--- /dev/null
+++ b/transformations/character_duplication/test.json
@@ -0,0 +1,50 @@
+{
+  "type": "character_duplication",
+  "test_cases": [
+    {
+      "class": "CharacterDuplication",
+      "inputs": {
+        "sentence": "Andrew finally returned the French book to Chris that I bought last week"
+      },
+      "outputs": [{
+        "sentence": "Anndrew ffinnallly returrned thee  French book too Chhris that I bought last  week"
+      }]
+    },
+    {
+      "class": "CharacterDuplication",
+      "inputs": {
+        "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."
+      },
+      "outputs": [{
+        "sentence": "Seentencees  witth gappiing, succhh as Paul likess cooffee and Mary tea, lackk an overt predicate ttoo indiicate tthe relation between two  orr moree arrguuments."
+      }]
+    },
+    {
+      "class": "CharacterDuplication",
+      "inputs": {
+        "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
+      },
+      "outputs": [{
+        "sentence": "Allice inn WWondderland  is a 200110 American livve-aaction/animated dark fanntasy adventure film"
+      }]
+    },
+    {
+      "class": "CharacterDuplication",
+      "inputs": {
+        "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
+      },
+      "outputs": [{
+        "sentence": "Ujjjal Deev  Dossanjh seerved ass  33rd Premier oof BBritish Columbia from 20000 to 2001"
+      }]
+    },
+    {
+      "class": "CharacterDuplication",
+      "inputs": {
+        "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."
+      },
+      "outputs": [{
+        "sentence": "Neeuroplaastticiity is aa continnuuous processingg alllowing short-term, mediium-term, and long-terrmm remoodelingg of the neuronosynaptic  orrganizzatiionn."
+      }]
+    }
+  ]
+}
diff --git a/transformations/character_duplication/transformation.py b/transformations/character_duplication/transformation.py
new file mode 100644
index 000000000..37334e3a1
--- /dev/null
+++ b/transformations/character_duplication/transformation.py
@@ -0,0 +1,44 @@
+import itertools
+import random
+
+from interfaces.SentenceOperation import SentenceOperation
+from tasks.TaskTypes import TaskType
+
+
+def duplicate(text, prob=0.1, seed=42, max_outputs=1):
+    random.seed(seed)
+
+    original_text = list(text)
+    perturbed_texts = []
+    for _ in itertools.repeat(None, max_outputs):
+        perturbed_text = [
+            [letter] if random.random() > prob else [letter, letter]
+            for letter in original_text
+        ]
+        perturbed_text = [
+            letter for sublist in perturbed_text for letter in sublist
+        ]
+        perturbed_texts.append("".join(perturbed_text))
+    return perturbed_texts
+
+
+class CharacterDuplication(SentenceOperation):
+    tasks = [
+        TaskType.TEXT_CLASSIFICATION,
+        TaskType.TEXT_TO_TEXT_GENERATION,
+        TaskType.TEXT_TAGGING,
+    ]
+    languages = ["All"]
+
+    def __init__(self, seed=42, max_outputs=1, prob=0.1):
+        super().__init__(seed, max_outputs=max_outputs)
+        self.prob = prob
+
+    def generate(self, sentence: str):
+        perturbed_texts = duplicate(
+            text=sentence,
+            prob=self.prob,
+            seed=self.seed,
+            max_outputs=self.max_outputs,
+        )
+        return perturbed_texts

From 92da719612c3be2a31b3e26d5128290d5b98fe90 Mon Sep 17 00:00:00 2001
From: marco-digio <marco.28.digio@gmail.com>
Date: Wed, 4 Aug 2021 10:29:15 +0200
Subject: [PATCH 3/6] Removed wrong files (different branch)

---
 transformations/underscore_trick/README.md    | 20 --------
 transformations/underscore_trick/__init__.py  |  1 -
 transformations/underscore_trick/test.json    | 50 -------------------
 .../underscore_trick/transformation.py        | 42 ----------------
 4 files changed, 113 deletions(-)
 delete mode 100644 transformations/underscore_trick/README.md
 delete mode 100644 transformations/underscore_trick/__init__.py
 delete mode 100644 transformations/underscore_trick/test.json
 delete mode 100644 transformations/underscore_trick/transformation.py

diff --git a/transformations/underscore_trick/README.md b/transformations/underscore_trick/README.md
deleted file mode 100644
index af0561df6..000000000
--- a/transformations/underscore_trick/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Underscore Trick
-This perturbation adds noise to all types of text sources (sentence, paragraph, etc.).
-
-Author name: Marco Di Giovanni
-Author email: marco.digiovanni@polimi.it
-Author Affiliation: Politecnico di Milano and University of Bologna
-
-## What type of a transformation is this?
-This transformation acts like a perturbation to test robustness.
-It replaces some random spaces with underscores.
-This transformation mimics particular behaviours:
-- names of folders, files, classes, functions (e.g., underscore_trick)
-- trick word counters when there is a limited maximum number of words to insert in an online questionary/quiz.
-Generated transformations display high similarity to the source sentences.
-
-## What tasks does it intend to benefit?
-This perturbation would benefit all tasks which have a sentence/paragraph/document as input like text classification, text generation, etc. Especially on tasks related to understanding/generating scripts.
-
-## What are the limitations of this transformation?
-The transformation's outputs are extremely simple to be used for data augmentation.
diff --git a/transformations/underscore_trick/__init__.py b/transformations/underscore_trick/__init__.py
deleted file mode 100644
index 930cdce0b..000000000
--- a/transformations/underscore_trick/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .transformation import *
diff --git a/transformations/underscore_trick/test.json b/transformations/underscore_trick/test.json
deleted file mode 100644
index 29640c164..000000000
--- a/transformations/underscore_trick/test.json
+++ /dev/null
@@ -1,50 +0,0 @@
-{
-  "type": "underscore_trick",
-  "test_cases": [
-    {
-      "class": "UnderscoreTrick",
-      "inputs": {
-        "sentence": "Andrew finally returned the French book to Chris that I bought last week"
-      },
-      "outputs": [{
-        "sentence": "Andrew finally_returned the French book to Chris that I_bought last week"
-      }]
-    },
-    {
-      "class": "UnderscoreTrick",
-      "inputs": {
-        "sentence": "Sentences with gapping, such as Paul likes coffee and Mary tea, lack an overt predicate to indicate the relation between two or more arguments."
-      },
-      "outputs": [{
-        "sentence": "Sentences with_gapping, such as Paul likes coffee and Mary_tea, lack an_overt predicate to indicate the relation between_two or more arguments."
-      }]
-    },
-    {
-      "class": "UnderscoreTrick",
-      "inputs": {
-        "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
-      },
-      "outputs": [{
-        "sentence": "Alice in_Wonderland is a 2010 American live-action/animated dark fantasy_adventure film"
-      }]
-    },
-    {
-      "class": "UnderscoreTrick",
-      "inputs": {
-        "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
-      },
-      "outputs": [{
-        "sentence": "Ujjal Dev_Dosanjh served as 33rd Premier of British Columbia_from 2000 to_2001"
-      }]
-    },
-    {
-      "class": "UnderscoreTrick",
-      "inputs": {
-        "sentence": "Neuroplasticity is a continuous processing allowing short-term, medium-term, and long-term remodeling of the neuronosynaptic organization."
-      },
-      "outputs": [{
-        "sentence": "Neuroplasticity is_a continuous processing allowing short-term, medium-term, and long-term_remodeling of the_neuronosynaptic organization."
-      }]
-    }
-  ]
-}
diff --git a/transformations/underscore_trick/transformation.py b/transformations/underscore_trick/transformation.py
deleted file mode 100644
index 4263dac83..000000000
--- a/transformations/underscore_trick/transformation.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import itertools
-import random
-
-from interfaces.SentenceOperation import SentenceOperation
-from tasks.TaskTypes import TaskType
-
-
-def add_underscore(text, prob=0.05, seed=42, max_outputs=1):
-    random.seed(seed)
-
-    perturbed_texts = []
-    for _ in itertools.repeat(None, max_outputs):
-        perturbed_text = "".join(
-            [
-                letter if letter != " " or random.random() > prob else "_"
-                for letter in text
-            ]
-        )
-        perturbed_texts.append(perturbed_text)
-    return perturbed_texts
-
-
-class UnderscoreTrick(SentenceOperation):
-    tasks = [
-        TaskType.TEXT_CLASSIFICATION,
-        TaskType.TEXT_TO_TEXT_GENERATION,
-        TaskType.TEXT_TAGGING,
-    ]
-    languages = ["All"]
-
-    def __init__(self, seed=42, max_outputs=1, prob=0.05):
-        super().__init__(seed, max_outputs=max_outputs)
-        self.prob = prob
-
-    def generate(self, sentence: str):
-        perturbed_texts = add_underscore(
-            text=sentence,
-            prob=self.prob,
-            seed=self.seed,
-            max_outputs=self.max_outputs,
-        )
-        return perturbed_texts

From 97866a7f1ea53822117a5a7c11957327d29c9802 Mon Sep 17 00:00:00 2001
From: marco-digio <marco.28.digio@gmail.com>
Date: Thu, 9 Sep 2021 16:49:26 +0200
Subject: [PATCH 4/6] Fix typo

---
 transformations/character_duplication/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformations/character_duplication/README.md b/transformations/character_duplication/README.md
index d01159082..01e5bb9d2 100644
--- a/transformations/character_duplication/README.md
+++ b/transformations/character_duplication/README.md
@@ -18,4 +18,4 @@ Generated transformations display high similarity to the source sentences.
 
 ## What are the limitations of this transformation?
 - This transformation is not capable of generating linguistically diverse text.
-- This transformation will mainly affect the perfornamce of token/word-level models, while character-level models should be much robust.
+- This transformation will mainly affect the performance of token/word-level models, while character-level models should be much more robust.

From 22a16e777e74d6d348421621caaac02fb7574add Mon Sep 17 00:00:00 2001
From: marco-digio <marco.28.digio@gmail.com>
Date: Thu, 9 Sep 2021 16:50:23 +0200
Subject: [PATCH 5/6] Add keywords

---
 transformations/character_duplication/transformation.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/transformations/character_duplication/transformation.py b/transformations/character_duplication/transformation.py
index 37334e3a1..2de353572 100644
--- a/transformations/character_duplication/transformation.py
+++ b/transformations/character_duplication/transformation.py
@@ -29,6 +29,15 @@ class CharacterDuplication(SentenceOperation):
         TaskType.TEXT_TAGGING,
     ]
     languages = ["All"]
+    keywords = [
+        "morphological",
+        "noise",
+        "rule-based",
+        "highly-meaning-preserving",
+        "high-precision",
+        "high-coverage",
+        "high-generations",
+    ]
 
     def __init__(self, seed=42, max_outputs=1, prob=0.1):
         super().__init__(seed, max_outputs=max_outputs)

From eb09bbcab3d2d47742ef47e353ec315e3ab05627 Mon Sep 17 00:00:00 2001
From: marco-digio <marco.28.digio@gmail.com>
Date: Wed, 29 Sep 2021 11:02:07 +0200
Subject: [PATCH 6/6] Fix tasks, add doc string and remove digits from being
 duplicated

---
 transformations/character_duplication/test.json       |  4 ++--
 .../character_duplication/transformation.py           | 11 +++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/transformations/character_duplication/test.json b/transformations/character_duplication/test.json
index c0f2d03d1..613314fe2 100644
--- a/transformations/character_duplication/test.json
+++ b/transformations/character_duplication/test.json
@@ -25,7 +25,7 @@
         "sentence": "Alice in Wonderland is a 2010 American live-action/animated dark fantasy adventure film"
       },
       "outputs": [{
-        "sentence": "Allice inn WWondderland  is a 200110 American livve-aaction/animated dark fanntasy adventure film"
+        "sentence": "Allice inn WWondderland  is a 2010 AAmmerican live-acctioon/animated dark fantasyy adventure film"
       }]
     },
     {
@@ -34,7 +34,7 @@
         "sentence": "Ujjal Dev Dosanjh served as 33rd Premier of British Columbia from 2000 to 2001"
       },
       "outputs": [{
-        "sentence": "Ujjjal Deev  Dossanjh seerved ass  33rd Premier oof BBritish Columbia from 20000 to 2001"
+        "sentence": "Ujjjal Deev  Dossanjh seerved ass  33rd Premier of  Briitish Columbia from 2000 to  2001"
       }]
     },
     {
diff --git a/transformations/character_duplication/transformation.py b/transformations/character_duplication/transformation.py
index 2de353572..9678e4206 100644
--- a/transformations/character_duplication/transformation.py
+++ b/transformations/character_duplication/transformation.py
@@ -1,4 +1,3 @@
-import itertools
 import random
 
 from interfaces.SentenceOperation import SentenceOperation
@@ -6,13 +5,18 @@
 
 
 def duplicate(text, prob=0.1, seed=42, max_outputs=1):
+    """
+    Duplicate randomly chosen characters (digits are never duplicated) in the text string, each with the specified probability. Return a list of perturbed strings; the number of strings in the list is given by max_outputs.
+    """
     random.seed(seed)
 
     original_text = list(text)
     perturbed_texts = []
-    for _ in itertools.repeat(None, max_outputs):
+    for _ in range(max_outputs):
         perturbed_text = [
-            [letter] if random.random() > prob else [letter, letter]
+            [letter]
+            if letter.isdigit() or random.random() > prob
+            else [letter, letter]
             for letter in original_text
         ]
         perturbed_text = [
@@ -26,7 +30,6 @@ class CharacterDuplication(SentenceOperation):
     tasks = [
         TaskType.TEXT_CLASSIFICATION,
         TaskType.TEXT_TO_TEXT_GENERATION,
-        TaskType.TEXT_TAGGING,
     ]
     languages = ["All"]
     keywords = [