More tests done

ijaniszewski · Aug 27, 2020 · 50470dd · 50470dd
1 parent e9c6d01
commit 50470dd
Show file tree

Hide file tree

Showing 11 changed files with 221 additions and 43 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,5 +2,7 @@ venv/
 .vscode/
 __pycache__/
 .cache/
-data/
+data/*
+!data/.keep
+all_data/
 .DS_Store
diff --git a/Dockerfile b/Dockerfile
@@ -2,4 +2,7 @@ FROM python:3.7
 COPY . /app
 WORKDIR /app
 RUN pip install -r requirements.txt
-RUN python src/run_from_file.py -m 1
+RUN pytest
+RUN pylint src/ tests/ run_from_file
+RUN python run_from_file.py --movie_id 1
+RUN python run_from_file.py --movie_id 220
diff --git a/README.md b/README.md
@@ -2,18 +2,20 @@
 
 ## Example of usage:
 
-`python run_from_file.py -m <movie_id>`
+Please add the data files to the data/ directory and run the Python script:
+
+`python run_from_file.py --movie_id <movie_id>`
 
 i.e.
 
-`python run_from_file.py -m 1`
+`python run_from_file.py --movie_id 220`
 
 or via Docker (just change the last line in Dockerfile)
 
-`RUN python run_from_file.py -m 1 `
+`RUN python run_from_file.py --movie_id 1 `
 
 ### Python Version
 
 `python -V`
 
-\$ Python 3.7.6
+$ Python 3.7.6
diff --git a/data/.keep b/data/.keep
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,3 @@
 pandas==1.1.1
+pylint==2.6.0
+pytest==6.0.1
diff --git a/run_from_file.py b/run_from_file.py
@@ -6,7 +6,9 @@
 
 
 class GetArgument:
-    def get_argument(self):
+    """Get passed movie ID and get info about it via GetInfo instance."""
+    @staticmethod
+    def get_argument():
         """Get the movie id from argument if it is integer.
 
         Return
@@ -27,10 +29,10 @@ def get_argument(self):
     def from_file(self):
         """Get the movie_id as passed argument and create GetInfo object."""
         movie_id = self.get_argument()
-        gi = GetInfo()
-        gi.main(movie_id)
+        get_info = GetInfo()
+        get_info.main(movie_id)
 
 
 if __name__ == "__main__":
     ga = GetArgument()
-    ga.from_file()
+    ga.from_file()
diff --git a/src/main.py b/src/main.py
@@ -1,57 +1,146 @@
+"""Ingest Data and Get Info from it, based on passed movie ID."""
+
 from dataclasses import dataclass, field
+from typing import List
 import glob
 import os
 import pandas as pd
-from typing import List
+import numpy as np
+
 
 from src.settings import comments_data, movies_data
 
 
 @dataclass
 class IngestData:
+    """Ingest data, create dataframes and concatenate in one df based on passed names."""
     first_file: str
     files_pattern: str
     col_names: List[str]
     index_col: str
     all_files: List = field(default_factory=list)
     data_directory: str = "data"
 
-    def create_paths_to_files(self):
+    def chcek_if_file_exists(self):
+        """Check if the first file with column names exists."""
+        if not os.path.isfile(self.first_file):
+            raise FileNotFoundError(f"There is no such file: {self.first_file}")
+
+    def create_paths_to_files(self) -> None:
+        """Create list of files to ingest as self parameters."""
         self.first_file = (os.path.join(self.data_directory, self.first_file))
+        self.chcek_if_file_exists()
         self.all_files = glob.glob(os.path.join(self.data_directory, self.files_pattern))
 
-    def get_df(self):
+    def get_df(self) -> pd.DataFrame:
+        """Create concatenated df from path to files.
+
+        Return
+        ----------
+        concatenated_df: pd.DataFrame
+            Concatenated dataframe from all of given csv files.
+        """
         self.create_paths_to_files()
-        df_from_each_file = (pd.read_csv(f, names=self.col_names, index_col=self.index_col) if f != self.first_file else pd.read_csv(self.first_file, index_col=self.index_col) for f in self.all_files)
+        df_from_each_file = (
+            pd.read_csv(f, names=self.col_names, index_col=self.index_col)
+            if f != self.first_file
+            else pd.read_csv(self.first_file, index_col=self.index_col)
+            for f in self.all_files)
         concatenated_df = pd.concat(df_from_each_file)
         return concatenated_df
 
 
 class GetInfo:
-    def get_dfs(self):
+    """Load dataframes and get info from it."""
+    @staticmethod
+    def get_dfs() -> (pd.DataFrame, pd.DataFrame):
+        """Load dataframes via IngestData class.
+
+        Return
+        ----------
+        comments_df: pd.DataFrame
+            concatenated dataframe from all comments csv files
+        movies_df: pd.DataFrame
+            concatenated dataframe from all movies csv files
+        """
         com_id = IngestData(**comments_data)
         comments_df = com_id.get_df()
         mov_id = IngestData(**movies_data)
         movies_df = mov_id.get_df()
         return comments_df, movies_df
 
-    def get_movie_title(self, movie_id, movies_df):
+    @staticmethod
+    def get_movie_title(movie_id: int, movies_df: pd.DataFrame) -> str:
+        """Return movie title based on given movie_id and dataframe.
+
+        Parameters
+        ----------
+        movie_id: int
+            movie ID as an integer, used to look up the movie title
+        movies_df: pd.DataFrame
+            dataframe to be searched
+
+        Return
+        ----------
+        title: str
+            title of searched movie. If there is no such ID - return None
+        """
         try:
             title = movies_df.loc[movie_id]["title"]
             return title
         except KeyError:
             return None
 
-    def get_movie_comments(self, movie_id, comments_df):
+    @staticmethod
+    def get_movie_comments(movie_id: int, comments_df: pd.DataFrame) -> int:
+        """Return number of comments based on given movie_id and dataframe.
+
+        Parameters
+        ----------
+        movie_id: int
+            movie_id as an integer to get its number of comments
+        comments_df: pd.DataFrame
+            dataframe to be searched
+
+        Return
+        ----------
+        comments_no: int
+            comments number of searched movie. If there is no such ID - return 0
+        """
         comments_no = comments_df[comments_df["id_movie"]==movie_id].sum()["id_movie"]
-        return comments_no
+        return int(comments_no)
+
+    @staticmethod
+    def check_correctness_of_data(dataframe, col_name, type_, df_name):
+        """Check if values in a column are as expected in a given dataframe."""
+        unique_values = dataframe[col_name].unique()
+        correct = all(isinstance(x, type_) for x in list(unique_values))
+        if not correct:
+            error_info = (f"There are other types than {type_} in dataframe "
+                          f"{df_name} in column {col_name}. Cannot process.")
+            raise ValueError(error_info)
+
+
+    def main(self, movie_id: int) -> None:
+        """Print title and comments number based on given movie ID.
 
-    def main(self, movie_id):
+        Parameters
+        ----------
+        movie_id: int
+            movie_id as an integer to get its title and number of comments
+        """
         comments_df, movies_df = self.get_dfs()
+        self.check_correctness_of_data(comments_df, "id_movie", np.int64, "comments_df")
+        self.check_correctness_of_data(movies_df, "id_game", np.int64, "movies_df")
         title = self.get_movie_title(movie_id, movies_df)
         comments_no = self.get_movie_comments(movie_id, comments_df)
         if title is None:
-            print(f"There is no title for id movie you provided: {movie_id}")
+            print(f"There is no title for movie ID you provided: {movie_id}")
         else:
-            print(f"There is {comments_no} comments for the movie: {title} (movie ID: {movie_id}).")
-
+            if comments_no == 0:
+                comments_text = "are no comments"
+            elif comments_no == 1:
+                comments_text = "is one comment"
+            else:
+                comments_text = f"are {comments_no} comments"
+            print(f"There {comments_text} for the movie: {title} (movie ID: {movie_id}).")
diff --git a/src/settings.py b/src/settings.py
@@ -1,3 +1,5 @@
+"""Settings how to ingest data from files in data directory."""
+
 comments_data = {
     "first_file": "comments-00.csv",
     "files_pattern": "comments-*.csv",
@@ -15,7 +17,7 @@
     "col_names": [
         "id_movie",
         "title",
-        "ig_game"
+        "id_game"
         ],
     "index_col": "id_movie"
-}
+}
diff --git a/tests/test_check_correctness_df_data.py b/tests/test_check_correctness_df_data.py
@@ -0,0 +1,62 @@
+# pylint: disable=C0103
+
+"""Test to check if data in dataframes is correct."""
+
+import pandas as pd
+import numpy as np
+import pytest
+
+from src.main import GetInfo
+from tests.test_get_info_from_df import create_movies_example_df
+
+
+def create_comments_df_with_wrong_data():
+    """Create example comments df with wrong data."""
+    data = {
+        "id_comment": list(range(6)),
+        "user": [f"user{user_no}" for user_no in range(6)],
+        "id_movie": [1 for no in range(5)] + ["bum"]
+    }
+    example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment")
+    return example_df
+
+
+def create_movies_df_with_wrong_data():
+    """Create example movies df with wrong data."""
+    data = {
+        "id_movie": list(range(6)),
+        "title": [f"user{user_no}" for user_no in range(6)],
+        "id_game": [1 for no in range(5)] + ["bum"]
+    }
+    example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie")
+    return example_df
+
+
+def assert_df_with_wrong_data(example_df, type_, df_name, col_name):
+    """Test check_correctness_of_data method."""
+    get_info = GetInfo()
+    error_info = (f"There are other types than {type_} in dataframe "
+                  f"{df_name} in column {col_name}. Cannot process.")
+
+    with pytest.raises(ValueError) as e:
+        get_info.check_correctness_of_data(example_df, col_name, type_, df_name)
+    assert str(e.value) == error_info
+
+
+def test_comments_df_with_wrong_data():
+    """Test check_correctness_of_data method for comments df if data is not correct."""
+    example_df = create_comments_df_with_wrong_data()
+    assert_df_with_wrong_data(example_df, np.int64, "comments_df", "id_movie")
+
+
+def test_movies_df_with_wrong_data():
+    """Test check_correctness_of_data method for movies df if data is not correct."""
+    example_df = create_movies_df_with_wrong_data()
+    assert_df_with_wrong_data(example_df, np.int64, "movies_df", "id_game")
+
+def test_movies_df_with_correct_data():
+    """Test check_correctness_of_data method for movies df if data is correct."""
+    example_df = create_movies_example_df()
+    get_info = GetInfo()
+    correct = get_info.check_correctness_of_data(example_df, "id_game", np.int64, "movies_df")
+    assert correct is None
diff --git a/tests/test_concatenate_df.py b/tests/test_concatenate_df.py
@@ -1,12 +1,20 @@
-import pandas as pd
+"""Test if IngestData concatenates data properly."""
 
 from src.settings import comments_data, movies_data
 from src.main import IngestData
 
 
-
 def test_comments_concatenate():
+    """Test if comments dataframe is concatenated by get_df method properly."""
     com_id = IngestData(**comments_data)
     comments_df = com_id.get_df()
-    assert comments_df.shape[0] > 1000
-    assert comments_df.shape[1] == 4
+    assert comments_df.shape[0] > 160000
+    assert comments_df.shape[1] == 2
+
+
+def test_movies_concatenate():
+    """Test if movies dataframe is concatenated by get_df method properly."""
+    com_id = IngestData(**movies_data)
+    movies_df = com_id.get_df()
+    assert movies_df.shape[0] > 200
+    assert movies_df.shape[1] == 2