Skip to content

Commit

Permalink
Add Generative Pseudo Labeling (#2388)
Browse files Browse the repository at this point in the history
  • Loading branch information
vblagoje authored Jun 2, 2022
1 parent 61d9429 commit e10a3fb
Show file tree
Hide file tree
Showing 14 changed files with 730 additions and 8 deletions.
2 changes: 1 addition & 1 deletion docs/_src/api/api/question_generator.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ come from earlier in the document.
#### QuestionGenerator.\_\_init\_\_

```python
def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", batch_size: Optional[int] = None)
def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, batch_size: Optional[int] = None)
```

Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is
Expand Down
39 changes: 39 additions & 0 deletions docs/_src/api/api/retriever.md
Original file line number Diff line number Diff line change
Expand Up @@ -1433,6 +1433,45 @@ Create embeddings for a list of documents.

Embeddings, one per input document

<a id="dense.EmbeddingRetriever.train"></a>

#### EmbeddingRetriever.train

```python
def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16) -> None
```

Trains/adapts the underlying embedding model.

Each training data example is a dictionary with the following keys:

* question: the question string
* pos_doc: the positive document string
* neg_doc: the negative document string
* score: the score margin between the positive and the negative document for the question

**Arguments**:

- `training_data` (`List[Dict[str, Any]]`): The training data.
- `learning_rate` (`float`): The learning rate.
- `n_epochs` (`int`): The number of epochs.
- `num_warmup_steps` (`int`): The number of warmup steps.
- `batch_size` (`int (optional)`): The batch size to use for training. Defaults to 16.

<a id="dense.EmbeddingRetriever.save"></a>

#### EmbeddingRetriever.save

```python
def save(save_dir: Union[Path, str]) -> None
```

Saves the model to the given directory.

**Arguments**:

- `save_dir` (`Union[Path, str]`): The directory where the model will be saved.

<a id="text2sparql"></a>

# Module text2sparql
Expand Down
1 change: 1 addition & 0 deletions haystack/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def __getattr__(self, attr):
retriever,
summarizer,
translator,
label_generator,
)

# Note that we ignore the ImportError here because if the user did not install
Expand Down
72 changes: 72 additions & 0 deletions haystack/json-schemas/haystack-pipeline-1.1.0.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@
{
"$ref": "#/definitions/PreProcessorComponent"
},
{
"$ref": "#/definitions/PseudoLabelGeneratorComponent"
},
{
"$ref": "#/definitions/QuestionGeneratorComponent"
},
Expand Down Expand Up @@ -2439,6 +2442,75 @@
],
"additionalProperties": false
},
"PseudoLabelGeneratorComponent": {
"type": "object",
"properties": {
"name": {
"title": "Name",
"description": "Custom name for the component. Helpful for visualization and debugging.",
"type": "string"
},
"type": {
"title": "Type",
"description": "Haystack Class name for the component.",
"type": "string",
"const": "PseudoLabelGenerator"
},
"params": {
"title": "Parameters",
"type": "object",
"properties": {
"question_producer": {
"title": "Question Producer",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
]
},
"retriever": {
"title": "Retriever",
"type": "string"
},
"cross_encoder_model_name_or_path": {
"title": "Cross Encoder Model Name Or Path",
"default": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"type": "string"
},
"total_number_of_questions": {
"title": "Total Number Of Questions",
"default": 9223372036854775807,
"type": "integer"
},
"top_k": {
"title": "Top K",
"default": 10,
"type": "integer"
}
},
"required": [
"question_producer",
"retriever"
],
"additionalProperties": false,
"description": "Each parameter can reference other components defined in the same YAML file."
}
},
"required": [
"type",
"name"
],
"additionalProperties": false
},
"QuestionGeneratorComponent": {
"type": "object",
"properties": {
Expand Down
72 changes: 72 additions & 0 deletions haystack/json-schemas/haystack-pipeline-1.3.0.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@
{
"$ref": "#/definitions/PreProcessorComponent"
},
{
"$ref": "#/definitions/PseudoLabelGeneratorComponent"
},
{
"$ref": "#/definitions/QuestionGeneratorComponent"
},
Expand Down Expand Up @@ -2618,6 +2621,75 @@
],
"additionalProperties": false
},
"PseudoLabelGeneratorComponent": {
"type": "object",
"properties": {
"name": {
"title": "Name",
"description": "Custom name for the component. Helpful for visualization and debugging.",
"type": "string"
},
"type": {
"title": "Type",
"description": "Haystack Class name for the component.",
"type": "string",
"const": "PseudoLabelGenerator"
},
"params": {
"title": "Parameters",
"type": "object",
"properties": {
"question_producer": {
"title": "Question Producer",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
]
},
"retriever": {
"title": "Retriever",
"type": "string"
},
"cross_encoder_model_name_or_path": {
"title": "Cross Encoder Model Name Or Path",
"default": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"type": "string"
},
"total_number_of_questions": {
"title": "Total Number Of Questions",
"default": 9223372036854775807,
"type": "integer"
},
"top_k": {
"title": "Top K",
"default": 10,
"type": "integer"
}
},
"required": [
"question_producer",
"retriever"
],
"additionalProperties": false,
"description": "Each parameter can reference other components defined in the same YAML file."
}
},
"required": [
"type",
"name"
],
"additionalProperties": false
},
"QuestionGeneratorComponent": {
"type": "object",
"properties": {
Expand Down
86 changes: 86 additions & 0 deletions haystack/json-schemas/haystack-pipeline-master.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@
{
"$ref": "#/definitions/PreProcessorComponent"
},
{
"$ref": "#/definitions/PseudoLabelGeneratorComponent"
},
{
"$ref": "#/definitions/QuestionGeneratorComponent"
},
Expand Down Expand Up @@ -3193,6 +3196,85 @@
],
"additionalProperties": false
},
"PseudoLabelGeneratorComponent": {
"type": "object",
"properties": {
"name": {
"title": "Name",
"description": "Custom name for the component. Helpful for visualization and debugging.",
"type": "string"
},
"type": {
"title": "Type",
"description": "Haystack Class name for the component.",
"type": "string",
"const": "PseudoLabelGenerator"
},
"params": {
"title": "Parameters",
"type": "object",
"properties": {
"question_producer": {
"title": "Question Producer",
"anyOf": [
{
"type": "string"
},
{
"type": "array",
"items": {
"type": "object",
"additionalProperties": {
"type": "string"
}
}
}
]
},
"retriever": {
"title": "Retriever",
"type": "string"
},
"cross_encoder_model_name_or_path": {
"title": "Cross Encoder Model Name Or Path",
"default": "cross-encoder/ms-marco-MiniLM-L-6-v2",
"type": "string"
},
"max_questions_per_document": {
"title": "Max Questions Per Document",
"default": 3,
"type": "integer"
},
"top_k": {
"title": "Top K",
"default": 50,
"type": "integer"
},
"batch_size": {
"title": "Batch Size",
"default": 4,
"type": "integer"
},
"progress_bar": {
"title": "Progress Bar",
"default": true,
"type": "boolean"
}
},
"required": [
"question_producer",
"retriever"
],
"additionalProperties": false,
"description": "Each parameter can reference other components defined in the same YAML file."
}
},
"required": [
"type",
"name"
],
"additionalProperties": false
},
"QuestionGeneratorComponent": {
"type": "object",
"properties": {
Expand Down Expand Up @@ -3254,6 +3336,10 @@
"title": "Prompt",
"default": "generate questions:"
},
"num_queries_per_doc": {
"title": "Num Queries Per Doc",
"default": 1
},
"batch_size": {
"title": "Batch Size",
"type": "integer"
Expand Down
1 change: 1 addition & 0 deletions haystack/nodes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
AzureConverter,
ParsrConverter,
)
from haystack.nodes.label_generator import PseudoLabelGenerator
from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers
from haystack.nodes.preprocessor import BasePreProcessor, PreProcessor
from haystack.nodes.query_classifier import SklearnQueryClassifier, TransformersQueryClassifier
Expand Down
1 change: 1 addition & 0 deletions haystack/nodes/label_generator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from haystack.nodes.label_generator.pseudo_label_generator import PseudoLabelGenerator
Loading

0 comments on commit e10a3fb

Please sign in to comment.