From 50470dd80ebe43598b8b2b920238c2d5ef9d11ad Mon Sep 17 00:00:00 2001 From: ijaniszewski Date: Thu, 27 Aug 2020 17:20:23 +0200 Subject: [PATCH] More tests done --- .gitignore | 4 +- Dockerfile | 5 +- README.md | 10 ++- data/.keep | 0 requirements.txt | 2 + run_from_file.py | 10 ++- src/main.py | 113 +++++++++++++++++++++--- src/settings.py | 6 +- tests/test_check_correctness_df_data.py | 62 +++++++++++++ tests/test_concatenate_df.py | 16 +++- tests/test_get_info_from_df.py | 36 ++++---- 11 files changed, 221 insertions(+), 43 deletions(-) create mode 100644 data/.keep create mode 100644 tests/test_check_correctness_df_data.py diff --git a/.gitignore b/.gitignore index 7d5ad66..3bce2c7 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,7 @@ venv/ .vscode/ __pycache__/ .cache/ -data/ +data/* +!data/.keep +all_data/ .DS_Store \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 0707470..af871de 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,4 +2,7 @@ FROM python:3.7 COPY . /app WORKDIR /app RUN pip install -r requirements.txt -RUN python src/run_from_file.py -m 1 \ No newline at end of file +RUN pytest +RUN pylint src/ tests/ run_from_file +RUN python run_from_file.py --movie_id 1 +RUN python run_from_file.py --movie_id 220 \ No newline at end of file diff --git a/README.md b/README.md index de7993b..fc1e627 100644 --- a/README.md +++ b/README.md @@ -2,18 +2,20 @@ ## Example of usage: -`python run_from_file.py -m ` +Please add data to data/ directory and run python script: + +`python run_from_file.py --movie_id ` i.e. -`python run_from_file.py -m 1` +`python run_from_file.py --movie_id 220` or via Docker (just change the last line in Dockerfile) -`RUN python run_from_file.py -m 1 ` +`RUN python run_from_file.py --movie_id 1 ` ### Python Version `python -V` -\$ Python 3.7.6 +$ Python 3.7.6 diff --git a/data/.keep b/data/.keep new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt index 9f98ab6..b514844 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,3 @@ pandas==1.1.1 +pylint==2.6.0 +pytest==6.0.1 diff --git a/run_from_file.py b/run_from_file.py index 9d9c651..96912b2 100644 --- a/run_from_file.py +++ b/run_from_file.py @@ -6,7 +6,9 @@ class GetArgument: - def get_argument(self): + """Get passed movie ID and get info about it via GetInfo instance.""" + @staticmethod + def get_argument(): """Get the movie id from argument if it is integer. Return @@ -27,10 +29,10 @@ def get_argument(self): def from_file(self): """Get the movie_id as passed argument and create GetInfo object.""" movie_id = self.get_argument() - gi = GetInfo() - gi.main(movie_id) + get_info = GetInfo() + get_info.main(movie_id) if __name__ == "__main__": ga = GetArgument() - ga.from_file() \ No newline at end of file + ga.from_file() diff --git a/src/main.py b/src/main.py index a775cb2..f0be68a 100644 --- a/src/main.py +++ b/src/main.py @@ -1,14 +1,19 @@ +"""Ingest Data and Get Info from it, based on passed movie ID.""" + from dataclasses import dataclass, field +from typing import List import glob import os import pandas as pd -from typing import List +import numpy as np + from src.settings import comments_data, movies_data @dataclass class IngestData: + """Ingest data, create dataframes and concatenate in one df based on passed names.""" first_file: str files_pattern: str col_names: List[str] @@ -16,42 +21,126 @@ class IngestData: all_files: List = field(default_factory=list) data_directory: str = "data" - def create_paths_to_files(self): + def chcek_if_file_exists(self): + """Check if the firs file with column names exists.""" + if not os.path.isfile(self.first_file): + raise FileNotFoundError(f"There is no such file: {self.first_file}") + + def create_paths_to_files(self) -> None: + """Create list of files to ingest as self parameters.""" self.first_file = (os.path.join(self.data_directory, self.first_file)) + self.chcek_if_file_exists() self.all_files = glob.glob(os.path.join(self.data_directory, self.files_pattern)) - def get_df(self): + def get_df(self) -> pd.DataFrame: + """Create concatenated df from path to files. + + Return + ---------- + concatenated_df: pd.DataFrame + Concatenated dataframe from all of given csv files. + """ self.create_paths_to_files() - df_from_each_file = (pd.read_csv(f, names=self.col_names, index_col=self.index_col) if f != self.first_file else pd.read_csv(self.first_file, index_col=self.index_col) for f in self.all_files) + df_from_each_file = ( + pd.read_csv(f, names=self.col_names, index_col=self.index_col) + if f != self.first_file + else pd.read_csv(self.first_file, index_col=self.index_col) + for f in self.all_files) concatenated_df = pd.concat(df_from_each_file) return concatenated_df class GetInfo: - def get_dfs(self): + """Load dataframes and get info from it.""" + @staticmethod + def get_dfs() -> (pd.DataFrame, pd.DataFrame): + """Load dataframes via IngesData class. + + Return + ---------- + comments_df: pd.DataFrame + concatenated dataframe from all comments csv files + movies_df: pd.DataFrame + concatenated dataframe from all movies csv files + """ com_id = IngestData(**comments_data) comments_df = com_id.get_df() mov_id = IngestData(**movies_data) movies_df = mov_id.get_df() return comments_df, movies_df - def get_movie_title(self, movie_id, movies_df): + @staticmethod + def get_movie_title(movie_id: int, movies_df: pd.DataFrame) -> str: + """Return movie title based on given movie_id and dataframe. + + Parameters + ---------- + movie_id: int + movie_id as an integer to get it's movie ID + movies_df: pd.DataFrame + dataframe to be searched + + Return + ---------- + title: str + title of searched movie. If there is no such ID - return None + """ try: title = movies_df.loc[movie_id]["title"] return title except KeyError: return None - def get_movie_comments(self, movie_id, comments_df): + @staticmethod + def get_movie_comments(movie_id: int, comments_df: pd.DataFrame) -> int: + """Return number of comments based on given movie_id and dataframe. + + Parameters + ---------- + movie_id: int + movie_id as an integer to get it's number of comments + movies_df: pd.DataFrame + dataframe to be searched + + Return + ---------- + comments_no: int + comments number of searched movie. If there is no such ID - return 0 + """ comments_no = comments_df[comments_df["id_movie"]==movie_id].sum()["id_movie"] - return comments_no + return int(comments_no) + + @staticmethod + def check_correctness_of_data(dataframe, col_name, type_, df_name): + """Check if values in a column are as expected in a given dataframe.""" + unique_values = dataframe[col_name].unique() + correct = all(isinstance(x, type_) for x in list(unique_values)) + if not correct: + error_info = (f"There are other types than {type_} in dataframe " + f"{df_name} in column {col_name}. Cannot process.") + raise ValueError(error_info) + + + def main(self, movie_id: int) -> None: + """Print title and comments number based on given movie ID. - def main(self, movie_id): + Parameters + ---------- + movie_id: int + movie_id as an integer to get it's title and number of comments + """ comments_df, movies_df = self.get_dfs() + self.check_correctness_of_data(comments_df, "id_movie", np.int64, "comments_df") + self.check_correctness_of_data(movies_df, "id_game", np.int64, "movies_df") title = self.get_movie_title(movie_id, movies_df) comments_no = self.get_movie_comments(movie_id, comments_df) if title is None: - print(f"There is no title for id movie you provided: {movie_id}") + print(f"There is no title for movie ID you provided: {movie_id}") else: - print(f"There is {comments_no} comments for the movie: {title} (movie ID: {movie_id}).") - + if comments_no == 0: + comments_text = "are no comments" + elif comments_no == 1: + comments_text = "is one comment" + else: + comments_text = f"are {comments_no} comments" + print(f"There {comments_text} for the movie: {title} (movie ID: {movie_id}).") diff --git a/src/settings.py b/src/settings.py index c119f63..a068a22 100644 --- a/src/settings.py +++ b/src/settings.py @@ -1,3 +1,5 @@ +"""Settings how to ingest data from files in data directory.""" + comments_data = { "first_file": "comments-00.csv", "files_pattern": "comments-*.csv", @@ -15,7 +17,7 @@ "col_names": [ "id_movie", "title", - "ig_game" + "id_game" ], "index_col": "id_movie" -} \ No newline at end of file +} diff --git a/tests/test_check_correctness_df_data.py b/tests/test_check_correctness_df_data.py new file mode 100644 index 0000000..a43d99a --- /dev/null +++ b/tests/test_check_correctness_df_data.py @@ -0,0 +1,62 @@ +# pylint: disable=C0103 + +"""Test to check if data in dataframes is correct.""" + +import pandas as pd +import numpy as np +import pytest + +from src.main import GetInfo +from tests.test_get_info_from_df import create_movies_example_df + + +def create_comments_df_with_wrong_data(): + """Create example comments df with wrong data.""" + data = { + "id_comment": list(range(6)), + "user": [f"user{user_no}" for user_no in range(6)], + "id_movie": [1 for no in range(5)] + ["bum"] + } + example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment") + return example_df + + +def create_movies_df_with_wrong_data(): + """Create example movies df with wrong data.""" + data = { + "id_movie": list(range(6)), + "title": [f"user{user_no}" for user_no in range(6)], + "id_game": [1 for no in range(5)] + ["bum"] + } + example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie") + return example_df + + +def assert_df_with_wrong_data(example_df, type_, df_name, col_name): + """Test check_correctness_of_data method.""" + get_info = GetInfo() + error_info = (f"There are other types than {type_} in dataframe " + f"{df_name} in column {col_name}. Cannot process.") + + with pytest.raises(ValueError) as e: + get_info.check_correctness_of_data(example_df, col_name, type_, df_name) + assert str(e.value) == error_info + + +def test_comments_df_with_wrong_data(): + """Test check_correctness_of_data method for comments df if data is not correct.""" + example_df = create_comments_df_with_wrong_data() + assert_df_with_wrong_data(example_df, np.int64, "comments_df", "id_movie") + + +def test_movies_df_with_wrong_data(): + """Test check_correctness_of_data method for movies df if data is not correct.""" + example_df = create_movies_df_with_wrong_data() + assert_df_with_wrong_data(example_df, np.int64, "movies_df", "id_game") + +def test_movies_df_with_correct_data(): + """Test check_correctness_of_data method for movies df if data is correct.""" + example_df = create_movies_example_df() + get_info = GetInfo() + correct = get_info.check_correctness_of_data(example_df, "id_game", np.int64, "movies_df") + assert correct is None diff --git a/tests/test_concatenate_df.py b/tests/test_concatenate_df.py index 40fd984..006045b 100644 --- a/tests/test_concatenate_df.py +++ b/tests/test_concatenate_df.py @@ -1,12 +1,20 @@ -import pandas as pd +"""Test if IngestData concatenates data properly.""" from src.settings import comments_data, movies_data from src.main import IngestData - def test_comments_concatenate(): + """Test if comments dataframe is concatenated by get_df method properly.""" com_id = IngestData(**comments_data) comments_df = com_id.get_df() - assert comments_df.shape[0] > 1000 - assert comments_df.shape[1] == 4 + assert comments_df.shape[0] > 160000 + assert comments_df.shape[1] == 2 + + +def test_movies_concatenate(): + """Test if movies dataframe is concatenated by get_df method properly.""" + com_id = IngestData(**movies_data) + movies_df = com_id.get_df() + assert movies_df.shape[0] > 200 + assert movies_df.shape[1] == 2 diff --git a/tests/test_get_info_from_df.py b/tests/test_get_info_from_df.py index b07f9a9..942334e 100644 --- a/tests/test_get_info_from_df.py +++ b/tests/test_get_info_from_df.py @@ -1,45 +1,51 @@ +"""Test if getting info from dataframes works properly.""" + import pandas as pd from src.main import GetInfo def create_comments_example_df(): + """Create example df based on how comments CSV are structured.""" data = { - "id_comment": [no for no in range(6)], + "id_comment": list(range(6)), "user": [f"user{user_no}" for user_no in range(6)], "id_movie": [1 for no in range(3)] + [2 for no in range(3)] } - df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment") - return df + example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment") + return example_df def create_movies_example_df(): + """Create example df based on how movies CSV are structured.""" data = { - "id_movie": [no for no in range(6)], + "id_movie": list(range(6)), "title": [f"movie_{movie}" for movie in range(6)], - "ig_game": [1 for no in range(3)] + [2 for no in range(3)] + "id_game": [1 for no in range(3)] + [2 for no in range(3)] } - df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie") - return df - + example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie") + return example_df def test_get_movie_comments(): + """Test get movie comments method based on example dataframe.""" example_df = create_comments_example_df() - gi = GetInfo() - comments_no = gi.get_movie_comments(1, example_df) + get_info = GetInfo() + comments_no = get_info.get_movie_comments(1, example_df) assert comments_no == 3 def test_get_movie_title(): + """Test get movie title method based on example dataframe.""" example_df = create_movies_example_df() - gi = GetInfo() - movie_title = gi.get_movie_title(1, example_df) + get_info = GetInfo() + movie_title = get_info.get_movie_title(1, example_df) assert movie_title == "movie_1" def test_get_movie_title_no_movie_id(): + """Test get movie tile method based on example dataframe if there is no such ID.""" example_df = create_movies_example_df() - gi = GetInfo() - movie_title = gi.get_movie_title('X', example_df) - assert movie_title == None + get_info = GetInfo() + movie_title = get_info.get_movie_title('X', example_df) + assert movie_title is None