From 50470dd80ebe43598b8b2b920238c2d5ef9d11ad Mon Sep 17 00:00:00 2001
From: ijaniszewski <ignacy.janiszewski@intelilex.net>
Date: Thu, 27 Aug 2020 17:20:23 +0200
Subject: [PATCH] More tests done

---
 .gitignore                              |   4 +-
 Dockerfile                              |   5 +-
 README.md                               |  10 ++-
 data/.keep                              |   0
 requirements.txt                        |   2 +
 run_from_file.py                        |  10 ++-
 src/main.py                             | 113 +++++++++++++++++++++---
 src/settings.py                         |   6 +-
 tests/test_check_correctness_df_data.py |  62 +++++++++++++
 tests/test_concatenate_df.py            |  16 +++-
 tests/test_get_info_from_df.py          |  36 ++++----
 11 files changed, 221 insertions(+), 43 deletions(-)
 create mode 100644 data/.keep
 create mode 100644 tests/test_check_correctness_df_data.py

diff --git a/.gitignore b/.gitignore
index 7d5ad66..3bce2c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,5 +2,7 @@ venv/
 .vscode/
 __pycache__/
 .cache/
-data/
+data/*
+!data/.keep
+all_data/
 .DS_Store
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 0707470..af871de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,4 +2,7 @@ FROM python:3.7
 COPY . /app
 WORKDIR /app
 RUN pip install -r requirements.txt
-RUN python src/run_from_file.py -m 1
\ No newline at end of file
+RUN pytest
+RUN pylint src/ tests/ run_from_file
+RUN python run_from_file.py --movie_id 1
+RUN python run_from_file.py --movie_id 220
\ No newline at end of file
diff --git a/README.md b/README.md
index de7993b..fc1e627 100644
--- a/README.md
+++ b/README.md
@@ -2,18 +2,20 @@
 
 ## Example of usage:
 
-`python run_from_file.py -m <movie_id>`
+Please add the data files to the data/ directory and run the Python script:
+
+`python run_from_file.py --movie_id <movie_id>`
 
 i.e.
 
-`python run_from_file.py -m 1`
+`python run_from_file.py --movie_id 220`
 
 or via Docker (just change the last line in Dockerfile)
 
-`RUN python run_from_file.py -m 1 `
+`RUN python run_from_file.py --movie_id 1 `
 
 ### Python Version
 
 `python -V`
 
-\$ Python 3.7.6
+$ Python 3.7.6
diff --git a/data/.keep b/data/.keep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
index 9f98ab6..b514844 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
 pandas==1.1.1
+pylint==2.6.0
+pytest==6.0.1
diff --git a/run_from_file.py b/run_from_file.py
index 9d9c651..96912b2 100644
--- a/run_from_file.py
+++ b/run_from_file.py
@@ -6,7 +6,9 @@
 
 
 class GetArgument:
-    def get_argument(self):
+    """Get passed movie ID and get info about it via GetInfo instance."""
+    @staticmethod
+    def get_argument():
         """Get the movie id from argument if it is integer.
 
         Return
@@ -27,10 +29,10 @@ def get_argument(self):
     def from_file(self):
         """Get the movie_id as passed argument and create GetInfo object."""
         movie_id = self.get_argument()
-        gi = GetInfo()
-        gi.main(movie_id)
+        get_info = GetInfo()
+        get_info.main(movie_id)
 
 
 if __name__ == "__main__":
     ga = GetArgument()
-    ga.from_file()
\ No newline at end of file
+    ga.from_file()
diff --git a/src/main.py b/src/main.py
index a775cb2..f0be68a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,14 +1,19 @@
+"""Ingest Data and Get Info from it, based on passed movie ID."""
+
 from dataclasses import dataclass, field
+from typing import List
 import glob
 import os
 import pandas as pd
-from typing import List
+import numpy as np
+
 
 from src.settings import comments_data, movies_data
 
 
 @dataclass
 class IngestData:
+    """Ingest data, create dataframes and concatenate in one df based on passed names."""
     first_file: str
     files_pattern: str
     col_names: List[str]
@@ -16,42 +21,126 @@ class IngestData:
     all_files: List = field(default_factory=list)
     data_directory: str = "data"
 
-    def create_paths_to_files(self):
+    def chcek_if_file_exists(self):
+        """Check if the first file (the one with column names) exists."""
+        if not os.path.isfile(self.first_file):
+            raise FileNotFoundError(f"There is no such file: {self.first_file}")
+
+    def create_paths_to_files(self) -> None:
+        """Create list of files to ingest as self parameters."""
         self.first_file = (os.path.join(self.data_directory, self.first_file))
+        self.chcek_if_file_exists()
         self.all_files = glob.glob(os.path.join(self.data_directory, self.files_pattern))
 
-    def get_df(self):
+    def get_df(self) -> pd.DataFrame:
+        """Create concatenated df from path to files.
+
+        Return
+        ----------
+        concatenated_df: pd.DataFrame
+            Concatenated dataframe from all of given csv files.
+        """
         self.create_paths_to_files()
-        df_from_each_file = (pd.read_csv(f, names=self.col_names, index_col=self.index_col) if f != self.first_file else pd.read_csv(self.first_file, index_col=self.index_col) for f in self.all_files)
+        df_from_each_file = (
+            pd.read_csv(f, names=self.col_names, index_col=self.index_col)
+            if f != self.first_file
+            else pd.read_csv(self.first_file, index_col=self.index_col)
+            for f in self.all_files)
         concatenated_df = pd.concat(df_from_each_file)
         return concatenated_df
 
 
 class GetInfo:
-    def get_dfs(self):
+    """Load dataframes and get info from it."""
+    @staticmethod
+    def get_dfs() -> (pd.DataFrame, pd.DataFrame):
+        """Load dataframes via IngestData class.
+
+        Return
+        ----------
+        comments_df: pd.DataFrame
+            concatenated dataframe from all comments csv files
+        movies_df: pd.DataFrame
+            concatenated dataframe from all movies csv files
+        """
         com_id = IngestData(**comments_data)
         comments_df = com_id.get_df()
         mov_id = IngestData(**movies_data)
         movies_df = mov_id.get_df()
         return comments_df, movies_df
 
-    def get_movie_title(self, movie_id, movies_df):
+    @staticmethod
+    def get_movie_title(movie_id: int, movies_df: pd.DataFrame) -> str:
+        """Return movie title based on given movie_id and dataframe.
+
+        Parameters
+        ----------
+        movie_id: int
+            movie_id as an integer to get its title
+        movies_df: pd.DataFrame
+            dataframe to be searched
+
+        Return
+        ----------
+        title: str
+            title of searched movie. If there is no such ID - return None
+        """
         try:
             title = movies_df.loc[movie_id]["title"]
             return title
         except KeyError:
             return None
 
-    def get_movie_comments(self, movie_id, comments_df):
+    @staticmethod
+    def get_movie_comments(movie_id: int, comments_df: pd.DataFrame) -> int:
+        """Return number of comments based on given movie_id and dataframe.
+
+        Parameters
+        ----------
+        movie_id: int
+            movie_id as an integer to get its number of comments
+        comments_df: pd.DataFrame
+            dataframe to be searched (must contain an "id_movie" column)
+
+        Return
+        ----------
+        comments_no: int
+            comments number of searched movie. If there is no such ID - return 0
+        """
-        comments_no = comments_df[comments_df["id_movie"]==movie_id].sum()["id_movie"]
-        return comments_no
+        comments_no = (comments_df["id_movie"] == movie_id).sum()  # count rows, not sum of IDs
+        return int(comments_no)
+
+    @staticmethod
+    def check_correctness_of_data(dataframe, col_name, type_, df_name):
+        """Check if values in a column are as expected in a given dataframe."""
+        unique_values = dataframe[col_name].unique()
+        correct = all(isinstance(x, type_) for x in list(unique_values))
+        if not correct:
+            error_info = (f"There are other types than {type_} in dataframe "
+                          f"{df_name} in column {col_name}. Cannot process.")
+            raise ValueError(error_info)
+
+
+    def main(self, movie_id: int) -> None:
+        """Print title and comments number based on given movie ID.
 
-    def main(self, movie_id):
+        Parameters
+        ----------
+        movie_id: int
+            movie_id as an integer to get its title and number of comments
+        """
         comments_df, movies_df = self.get_dfs()
+        self.check_correctness_of_data(comments_df, "id_movie", np.int64, "comments_df")
+        self.check_correctness_of_data(movies_df, "id_game", np.int64, "movies_df")
         title = self.get_movie_title(movie_id, movies_df)
         comments_no = self.get_movie_comments(movie_id, comments_df)
         if title is None:
-            print(f"There is no title for id movie you provided: {movie_id}")
+            print(f"There is no title for movie ID you provided: {movie_id}")
         else:
-            print(f"There is {comments_no} comments for the movie: {title} (movie ID: {movie_id}).")
-        
+            if comments_no == 0:
+                comments_text = "are no comments"
+            elif comments_no == 1:
+                comments_text = "is one comment"
+            else:
+                comments_text = f"are {comments_no} comments"
+            print(f"There {comments_text} for the movie: {title} (movie ID: {movie_id}).")
diff --git a/src/settings.py b/src/settings.py
index c119f63..a068a22 100644
--- a/src/settings.py
+++ b/src/settings.py
@@ -1,3 +1,5 @@
+"""Settings describing how to ingest data from files in the data directory."""
+
 comments_data = {
     "first_file": "comments-00.csv",
     "files_pattern": "comments-*.csv",
@@ -15,7 +17,7 @@
     "col_names": [
         "id_movie",
         "title",
-        "ig_game"
+        "id_game"
         ],
     "index_col": "id_movie"
-}
\ No newline at end of file
+}
diff --git a/tests/test_check_correctness_df_data.py b/tests/test_check_correctness_df_data.py
new file mode 100644
index 0000000..a43d99a
--- /dev/null
+++ b/tests/test_check_correctness_df_data.py
@@ -0,0 +1,62 @@
+# pylint: disable=C0103
+
+"""Test to check if data in dataframes is correct."""
+
+import pandas as pd
+import numpy as np
+import pytest
+
+from src.main import GetInfo
+from tests.test_get_info_from_df import create_movies_example_df
+
+
+def create_comments_df_with_wrong_data():
+    """Create example comments df with wrong data."""
+    data = {
+        "id_comment": list(range(6)),
+        "user": [f"user{user_no}" for user_no in range(6)],
+        "id_movie": [1 for no in range(5)] + ["bum"]
+    }
+    example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment")
+    return example_df
+
+
+def create_movies_df_with_wrong_data():
+    """Create example movies df with wrong data."""
+    data = {
+        "id_movie": list(range(6)),
+        "title": [f"user{user_no}" for user_no in range(6)],
+        "id_game": [1 for no in range(5)] + ["bum"]
+    }
+    example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie")
+    return example_df
+
+
+def assert_df_with_wrong_data(example_df, type_, df_name, col_name):
+    """Test check_correctness_of_data method."""
+    get_info = GetInfo()
+    error_info = (f"There are other types than {type_} in dataframe "
+                  f"{df_name} in column {col_name}. Cannot process.")
+
+    with pytest.raises(ValueError) as e:
+        get_info.check_correctness_of_data(example_df, col_name, type_, df_name)
+    assert str(e.value) == error_info
+
+
+def test_comments_df_with_wrong_data():
+    """Test check_correctness_of_data method for comments df if data is not correct."""
+    example_df = create_comments_df_with_wrong_data()
+    assert_df_with_wrong_data(example_df, np.int64, "comments_df", "id_movie")
+
+
+def test_movies_df_with_wrong_data():
+    """Test check_correctness_of_data method for movies df if data is not correct."""
+    example_df = create_movies_df_with_wrong_data()
+    assert_df_with_wrong_data(example_df, np.int64, "movies_df", "id_game")
+
+def test_movies_df_with_correct_data():
+    """Test check_correctness_of_data method for movies df if data is correct."""
+    example_df = create_movies_example_df()
+    get_info = GetInfo()
+    correct = get_info.check_correctness_of_data(example_df, "id_game", np.int64, "movies_df")
+    assert correct is None
diff --git a/tests/test_concatenate_df.py b/tests/test_concatenate_df.py
index 40fd984..006045b 100644
--- a/tests/test_concatenate_df.py
+++ b/tests/test_concatenate_df.py
@@ -1,12 +1,20 @@
-import pandas as pd
+"""Test if IngestData concatenates data properly."""
 
 from src.settings import comments_data, movies_data
 from src.main import IngestData
 
 
-
 def test_comments_concatenate():
+    """Test if comments dataframe is concatenated by get_df method properly."""
     com_id = IngestData(**comments_data)
     comments_df = com_id.get_df()
-    assert comments_df.shape[0] > 1000
-    assert comments_df.shape[1] == 4
+    assert comments_df.shape[0] > 160000
+    assert comments_df.shape[1] == 2
+
+
+def test_movies_concatenate():
+    """Test if movies dataframe is concatenated by get_df method properly."""
+    com_id = IngestData(**movies_data)
+    movies_df = com_id.get_df()
+    assert movies_df.shape[0] > 200
+    assert movies_df.shape[1] == 2
diff --git a/tests/test_get_info_from_df.py b/tests/test_get_info_from_df.py
index b07f9a9..942334e 100644
--- a/tests/test_get_info_from_df.py
+++ b/tests/test_get_info_from_df.py
@@ -1,45 +1,51 @@
+"""Test if getting info from dataframes works properly."""
+
 import pandas as pd
 
 from src.main import GetInfo
 
 
 def create_comments_example_df():
+    """Create example df based on how comments CSV are structured."""
     data = {
-        "id_comment": [no for no in range(6)],
+        "id_comment": list(range(6)),
         "user": [f"user{user_no}" for user_no in range(6)],
         "id_movie": [1 for no in range(3)] + [2 for no in range(3)]
     }
-    df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment")
-    return df
+    example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_comment")
+    return example_df
 
 
 def create_movies_example_df():
+    """Create example df based on how movies CSV are structured."""
     data = {
-        "id_movie": [no for no in range(6)],
+        "id_movie": list(range(6)),
         "title": [f"movie_{movie}" for movie in range(6)],
-        "ig_game": [1 for no in range(3)] + [2 for no in range(3)]
+        "id_game": [1 for no in range(3)] + [2 for no in range(3)]
     }
-    df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie")
-    return df
-
+    example_df = pd.DataFrame(data, columns = data.keys()).set_index("id_movie")
+    return example_df
 
 
 def test_get_movie_comments():
+    """Test get movie comments method based on example dataframe."""
     example_df = create_comments_example_df()
-    gi = GetInfo()
-    comments_no = gi.get_movie_comments(1, example_df)
+    get_info = GetInfo()
+    comments_no = get_info.get_movie_comments(1, example_df)
     assert comments_no == 3
 
 
 def test_get_movie_title():
+    """Test get movie title method based on example dataframe."""
     example_df = create_movies_example_df()
-    gi = GetInfo()
-    movie_title = gi.get_movie_title(1, example_df)
+    get_info = GetInfo()
+    movie_title = get_info.get_movie_title(1, example_df)
     assert movie_title == "movie_1"
 
 
 def test_get_movie_title_no_movie_id():
+    """Test get movie title method based on example dataframe if there is no such ID."""
     example_df = create_movies_example_df()
-    gi = GetInfo()
-    movie_title = gi.get_movie_title('X', example_df)
-    assert movie_title == None
+    get_info = GetInfo()
+    movie_title = get_info.get_movie_title('X', example_df)
+    assert movie_title is None