deel-ai
diff --git a/‎tests/attributions/test_nlp_occlusion.py
+47 b/‎tests/attributions/test_nlp_occlusion.py
+47
diff --git a/‎tests/nlp/__init__.py b/‎tests/nlp/__init__.py
diff --git a/‎tests/nlp/test_token_extractor.py
+143 b/‎tests/nlp/test_token_extractor.py
+143
diff --git a/‎xplique/attributions/__init__.py
+1 b/‎xplique/attributions/__init__.py
+1
diff --git a/‎xplique/attributions/nlp_occlusion.py
+106 b/‎xplique/attributions/nlp_occlusion.py
+106
diff --git a/‎xplique/commons/__init__.py
+1 b/‎xplique/commons/__init__.py
+1
@@ -0,0 +1,47 @@
+"""
+Test the NlpOcclusion attribution method
+"""
+import numpy as np
+
+from xplique.attributions import NlpOcclusion
+
+def test_masks():
+    """Test the masks creation"""
+    sentence = "aaa bbb ccc"
+    words = sentence.split(" ")
+    masks = NlpOcclusion._get_masks(words)
+    assert masks.shape == (len(words), len(words))
+    expected_mask = np.array([[False, True, True],
+                              [True, False, True],
+                              [True, True, False]])
+
+    assert np.array_equal(masks, expected_mask)
+
+def test_apply_masks():
+    """Test if the application of a mask generate valid results"""
+    sentence = "aaa bbb ccc"
+    words = sentence.split(" ")
+    masks = NlpOcclusion._get_masks(words)
+
+    occluded_inputs = NlpOcclusion._apply_masks(words, masks)
+    expected_occludec_inputs = [['bbb', 'ccc'], ['aaa', 'ccc'], ['aaa', 'bbb']]
+    assert np.array_equal(occluded_inputs, expected_occludec_inputs)
+
+def test_output_shape():
+    """Test the output shape for several input sentences"""
+
+    nb_concepts = 10
+
+    def transform(inputs):
+        # simulate the transorm method used in Craft/Cockatiel
+        return np.ones((len(inputs), nb_concepts))
+
+    input_sentence = ["aaa bbb ccc ddd eee fff", "ggg hhh iii jjj"]
+    for sentence in input_sentence:
+        words = sentence.split(" ")
+        separator = " "
+
+        method = NlpOcclusion(model=transform)
+        sensitivity = method.explain(sentence, words, separator)
+
+        assert sensitivity.shape == (nb_concepts, len(words))
@@ -0,0 +1,143 @@
+from xplique.commons.nlp import WordExtractor, SentenceExtractor
+from xplique.commons.nlp import ClauseExtractor, ExcerptExtractor, ExtractorFactory
+
+import pytest
+
+@pytest.fixture
+def example_sentence():
+    return "One two three. Second sentence.Third Sentence, test1, test2; test3-test4 .GO!"\
+           " Trust me,. sentence not starting with capital letter."\
+           " Sentence with dots..Word, Word, word,...word ....so a sentence"
+
+def test_word_extractor(example_sentence):
+    extractor = WordExtractor()
+    tokens, separator = extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    assert separator == ' '
+    expected_tokens = [ 'One', 'two', 'three', '.',
+                        'Second', 'sentence.Third', 'Sentence', ',', 'test1', ',', 'test2', ';',
+                        'test3-test4', '.GO', '!', 'Trust', 'me', ',', '.', 'sentence', 'not',
+                        'starting','with', 'capital', 'letter', '.', 'Sentence', 'with', 'dots',
+                        '..', 'Word', ',', 'Word', ',', 'word', ',', '...', 'word', '....', 'so',
+                        'a', 'sentence']
+    assert tokens == expected_tokens, print('tokens:', tokens)
+
+def test_word_extractor_ignore_words(example_sentence):
+    extractor = WordExtractor(ignore_words = ['me', 'not', 'so', 'a',])
+    tokens, separator = extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    assert separator == ' '
+    expected_tokens = [ 'One', 'two', 'three', '.',
+                        'Second', 'sentence.Third', 'Sentence', ',', 'test1', ',', 'test2', ';',
+                        'test3-test4', '.GO', '!', 'Trust', ',', '.', 'sentence', 'starting',
+                        'with','capital', 'letter', '.', 'Sentence', 'with', 'dots', '..',
+                        'Word', ',', 'Word', ',', 'word', ',', '...', 'word', '....', 'sentence']
+    assert tokens == expected_tokens, print('tokens:', tokens)
+
+def test_word_extractor_from_list(example_sentence):
+    extractor = WordExtractor()
+    tokens, separator = extractor.extract_tokens([example_sentence, example_sentence])
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    assert separator == ' '
+
+def test_sentence_extractor(example_sentence):
+    extractor = SentenceExtractor()
+    tokens, separator = extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    assert separator == '. '
+    expected_tokens = [ 'One two three.',
+                        'Second sentence.Third Sentence, test1, test2; test3-test4 .GO!',
+                        'Trust me,.',
+                        'sentence not starting with capital letter.',
+                        'Sentence with dots..Word, Word, word,...word ....so a sentence']
+    assert tokens == expected_tokens, print('tokens:', tokens)
+
+def test_excerpt_extractor(example_sentence):
+    extractor = ExcerptExtractor()
+    tokens, separator = extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    assert separator == ' '
+    expected_tokens = [ 'One two three.',
+                        'Second sentence.',
+                        'Third Sentence, test1, test2; test3-test4 .',
+                        'GO!',
+                        'Trust me,.',
+                        'Sentence with dots.',
+                        'Word, Word, word,.']
+    assert tokens == expected_tokens, print('tokens:', tokens)
+
+def test_clause_extractor_close_type_none(example_sentence):
+    clause_extractor = ClauseExtractor(clause_type = None)
+    tokens, separator = clause_extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    expected_tokens = [ 'One two three',
+                        'Second sentence.Third Sentence',
+                        'test1',
+                        'test2',
+                        'test3-test4',
+                        'GO',
+                        'Trust',
+                        'me',
+                        'sentence',
+                        'not starting',
+                        'with',
+                        'capital letter',
+                        'Sentence',
+                        'with',
+                        'dots.',
+                        'Word',
+                        'Word, word,...word',
+                        'a sentence']
+    assert tokens == expected_tokens
+
+
+def test_clause_extractor_close_type_NP(example_sentence):
+    clause_extractor = ClauseExtractor(clause_type = ['NP'])
+    tokens, separator = clause_extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    expected_tokens = [ 'One two three',
+                        'Second sentence.Third Sentence',
+                        'test1',
+                        'test2',
+                        'test3-test4',
+                        'me',
+                        'sentence',
+                        'capital letter',
+                        'Sentence',
+                        'dots.',
+                        'Word',
+                        'Word, word,...word',
+                        'a sentence']
+    assert tokens == expected_tokens
+
+def test_clause_extractor_close_type_ADJP(example_sentence):
+    clause_extractor = ClauseExtractor(clause_type = ['ADJP'])
+    tokens, separator = clause_extractor.extract_tokens(example_sentence)
+    assert isinstance(tokens, list)
+    assert isinstance(separator, str)
+    print(tokens)
+    expected_tokens = []
+    assert tokens == expected_tokens
+
+def test_extractor_factory():
+    word_extractor = ExtractorFactory.get_extractor(extract_fct="word")
+    assert isinstance(word_extractor, WordExtractor)
+
+    sentence_extractor = ExtractorFactory.get_extractor(extract_fct="sentence")
+    assert isinstance(sentence_extractor, SentenceExtractor)
+
+    excerpt_extractor = ExtractorFactory.get_extractor(extract_fct="excerpt")
+    assert isinstance(excerpt_extractor, ExcerptExtractor)
+
+    clause_extractor = ExtractorFactory.get_extractor(extract_fct="clause", clause_type=['NP'])
+    assert isinstance(clause_extractor, ClauseExtractor)
+
+    with pytest.raises(ValueError):
+        ExtractorFactory.get_extractor(extract_fct="invalid")
@@ -16,4 +16,5 @@
 from .object_detector import BoundingBoxesExplainer
 from .global_sensitivity_analysis import SobolAttributionMethod, HsicAttributionMethod
 from .gradient_statistics import SmoothGrad, VarGrad, SquareGrad
+from .nlp_occlusion import NlpOcclusion
 from . import global_sensitivity_analysis
@@ -0,0 +1,106 @@
+"""
+Module related to Occlusion sensitivity method for NLP.
+"""
+
+import numpy as np
+
+from .base import BlackBoxExplainer
+from ..commons import Tasks
+from ..types import Callable, Union, Optional, OperatorSignature, List
+
+class NlpOcclusion(BlackBoxExplainer):
+    """
+    Occlusion class for NLP.
+    """
+    def __init__(self,
+                 model: Callable,
+                 batch_size: Optional[int] = 32,
+                 operator: Optional[Union[Tasks, str, OperatorSignature]] = None):
+        super().__init__(model, batch_size, operator)
+
+    @staticmethod
+    def _get_masks(input_len: int) -> np.ndarray:
+        """
+        Generate occlusion masks for a given input length.
+
+        Parameters
+        ----------
+        input_len : int
+            The length of the input for which occlusion masks are generated.
+            Typically it will be the number of words of a sentence.
+
+        Returns
+        -------
+        occlusion_masks : np.ndarray
+            The boolean occlusion masks, an identity matrix with False for the main diagonal.
+            This kind of mask can be used to generate n sentences,
+            each with a single distinct word removed.
+        """
+        return np.eye(input_len) == 0
+
+    @staticmethod
+    def _apply_masks(words: List[str], masks: np.ndarray) -> np.ndarray:
+        """
+        Apply occlusion masks to a list of words.
+
+        Parameters
+        ----------
+        words : List[str]
+            The list of words to which occlusion masks are applied.
+        masks : np.ndarray
+            The boolean occlusion masks to be applied.
+
+        Returns
+        -------
+        occluded_words : np.ndarray
+            The list of words with occlusion masks applied.
+        """
+        perturbated_words = [np.array(words)[mask].tolist() for mask in masks]
+        return perturbated_words
+
+    def explain(self,
+                sentence: str,
+                words: List[str],
+                separator: str) -> np.ndarray:
+        """
+        Generate an explanation for the input sentence, by providing the importance of each word.
+        The importance will be computed by successively occluding each word of the sentence and
+        studying the impact of this occlusion on the model results.
+
+        Parameters
+        ----------
+        sentence : str
+            The input sentence for which an explanation is generated.
+        words : List[str]
+            List of words used to generate the explanation. These words must be part of
+            the input sentence, the importance will be computed on this list of words
+            (i.e some words of the original sentence can be omited this way).
+        separator : str
+            The separator used to join the words after the occlusion step, so a full
+            sentence can be fed to the model.
+
+        Returns
+        -------
+        explanation : np.ndarray
+            The generated explanation of format (nb_concepts, nb_words).
+        """
+
+        # generate n sentences with a different word masked (removed) each time
+        masks = NlpOcclusion._get_masks(len(words))
+        perturbated_words = NlpOcclusion._apply_masks(words, masks)
+
+        perturbated_sentences = [sentence]
+        perturbated_sentences.extend(
+            [separator.join(perturbated_word) for perturbated_word in perturbated_words])
+
+        # transform the perturbated reviews into their concept representation
+        # u_values has shape: ((W+1) x C)
+        u_values = self.model(perturbated_sentences)
+
+        # Compute sensitivities: importances = u_value of the whole sentence - u_value of each word
+        whole_sentence_uvalues = u_values[0,:]
+        words_uvalues = u_values[1:,:]
+        l_importances = (whole_sentence_uvalues - words_uvalues).transpose()
+        l_importances /= (np.max(np.abs(l_importances)) + 1e-5)
+
+        return l_importances
@@ -11,3 +11,4 @@
                                    get_inference_function, get_gradient_functions)
 from .exceptions import no_gradients_available, raise_invalid_operator
 from .forgrad import forgrad
+from .nlp import TokenExtractor, WordExtractor, SentenceExtractor, ClauseExtractor, ExcerptExtractor, ExtractorFactory