-
Notifications
You must be signed in to change notification settings - Fork 5
Typing #160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Typing #160
Changes from all commits
fd0926c
b3b9af4
c5fa5bf
59f66b2
f8b6990
0c0d01e
5f1dd69
be84f00
d8e4e7b
6ff6406
e79b456
9c9208c
09737b8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,14 +12,15 @@ | |
| import gzip | ||
| import multiprocessing | ||
| import xml.etree.ElementTree | ||
| from typing import Iterator | ||
|
|
||
| __version__ = '0.2.0' | ||
|
|
||
| FRAMES_PER_SECOND = 30 | ||
| PUNCTUATION = tuple(".,:;?!()[]'") | ||
|
|
||
|
|
||
| def _parse_time_string(time_string): | ||
| def _parse_time_string(time_string: str) -> float: | ||
| """ | ||
| parses string and returns time in seconds. | ||
|
|
||
|
|
@@ -32,7 +33,7 @@ def _parse_time_string(time_string): | |
| float(frames) / FRAMES_PER_SECOND) | ||
|
|
||
|
|
||
| def read_clean_gzfile(gz_file_path, *, break_duration=2.0): | ||
| def read_clean_gzfile(gz_file_path: str, *, break_duration=2.0) -> Iterator[str]: | ||
| """ | ||
| Generator that opens and reads a gunzipped xml subtitle file, while all | ||
| xml tags and timestamps are removed. | ||
|
|
@@ -68,8 +69,10 @@ def read_clean_gzfile(gz_file_path, *, break_duration=2.0): | |
| text = word_tag.text | ||
| if text in PUNCTUATION: | ||
| words.append(text) | ||
| else: | ||
| elif text is not None: | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we have enough test coverage here? Is this the right thing to do? If yes, this should be merged into master as soon as possible as well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We agreed that it might be good to raise an exception here when `text` is `None`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now we raise an exception for null text. |
||
| words.extend((' ', text)) | ||
| else: | ||
| raise ValueError("Text content of word tag is None.") | ||
| result = ''.join(words) | ||
| result = result.strip() | ||
|
|
||
|
|
@@ -112,7 +115,7 @@ class JobParseGz(): | |
|
|
||
| """ | ||
|
|
||
| def __init__(self, break_duration): | ||
| def __init__(self, break_duration: float) -> None: | ||
| self.break_duration = break_duration | ||
|
|
||
| def run(self, filename): | ||
|
|
@@ -126,7 +129,7 @@ def run(self, filename): | |
| return (lines, not_found) | ||
|
|
||
|
|
||
| def create_corpus_from_gz(directory, outfile, *, n_threads=1, verbose=False): | ||
| def create_corpus_from_gz(directory: str, outfile: str, *, n_threads=1, verbose=False): | ||
| """ | ||
| Create a corpus file from a set of gunziped (.gz) files in a directory. | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,12 +9,15 @@ | |
| """ | ||
|
|
||
| import gzip | ||
| from collections import Iterator, Iterable | ||
| from collections import Iterable | ||
| from typing import Iterator, List, Optional, Tuple, Union, cast | ||
|
|
||
| import pandas as pd | ||
|
|
||
| from .types import CollectionEvent, StringEvent | ||
|
|
||
| def events_from_file(event_path, compression="gzip"): | ||
|
|
||
| def events_from_file(event_path: str, compression: Optional[str] = "gzip") -> Iterator[Tuple[List[str], List[str]]]: | ||
| """ | ||
| Yields events for all events in a gzipped event file. | ||
|
|
||
|
|
@@ -30,8 +33,8 @@ def events_from_file(event_path, compression="gzip"): | |
| ------ | ||
| cues, outcomes : list, list | ||
| a tuple of two lists containing cues and outcomes | ||
|
|
||
| """ | ||
|
|
||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We usually do not have an empty line after the docstring. |
||
| if compression == "gzip": | ||
| event_file = gzip.open(event_path, 'rt') | ||
| elif compression is None: | ||
|
|
@@ -51,8 +54,11 @@ def events_from_file(event_path, compression="gzip"): | |
| event_file.close() | ||
|
|
||
|
|
||
| def events_to_file(events, file_path, delimiter="\t", compression="gzip", | ||
| columns=("cues", "outcomes")): | ||
| def events_to_file(events: Union[Iterator[StringEvent], Iterator[CollectionEvent], pd.DataFrame], | ||
| file_path: str, | ||
| delimiter: str = "\t", | ||
| compression: Optional[str] = "gzip", | ||
| columns: Tuple[str, str] = ("cues", "outcomes")) -> None: | ||
| """ | ||
| Writes events to a file | ||
|
|
||
|
|
@@ -75,9 +81,11 @@ def events_to_file(events, file_path, delimiter="\t", compression="gzip", | |
|
|
||
| """ | ||
| if isinstance(events, pd.DataFrame): | ||
| events = events_from_dataframe(events) | ||
| collection_events = events_from_dataframe(events) | ||
| elif isinstance(events, (Iterator, Iterable)): | ||
| events = events_from_list(events) | ||
| collection_events = events_from_list(cast(Union[Iterator[StringEvent], | ||
| Iterator[CollectionEvent]], | ||
| events)) | ||
| else: | ||
| raise ValueError("events should either be a pd.DataFrame or an Iterator or an Iterable.") | ||
|
|
||
|
|
@@ -91,7 +99,7 @@ def events_to_file(events, file_path, delimiter="\t", compression="gzip", | |
| try: | ||
| out_file.write("{}\n".format(delimiter.join(columns))) | ||
|
|
||
| for cues, outcomes in events: | ||
| for cues, outcomes in collection_events: | ||
| if isinstance(cues, list) and isinstance(outcomes, list): | ||
| line = "{}{}{}\n".format("_".join(cues), | ||
| delimiter, | ||
|
|
@@ -105,7 +113,8 @@ def events_to_file(events, file_path, delimiter="\t", compression="gzip", | |
| out_file.close() | ||
|
|
||
|
|
||
| def events_from_dataframe(df, columns=("cues", "outcomes")): | ||
| def events_from_dataframe(df: pd.DataFrame, | ||
| columns: Tuple[str, str] = ("cues", "outcomes")) -> Iterator[CollectionEvent]: | ||
| """ | ||
| Yields events for all events in a pandas dataframe. | ||
|
|
||
|
|
@@ -130,7 +139,7 @@ def events_from_dataframe(df, columns=("cues", "outcomes")): | |
| yield (cues, outcomes) | ||
|
|
||
|
|
||
| def events_from_list(lst): | ||
| def events_from_list(lst: Union[Iterator[StringEvent], Iterator[CollectionEvent]]) -> Iterator[CollectionEvent]: | ||
| """ | ||
| Yields events for all events in a list. | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This part of the memory printing change should go into master as well (independent of the typing), IMHO.