Skip to content

Commit

Permalink
More tests done
Browse files Browse the repository at this point in the history
  • Loading branch information
ijaniszewski committed Aug 27, 2020
1 parent e9c6d01 commit 50470dd
Show file tree
Hide file tree
Showing 11 changed files with 221 additions and 43 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,7 @@ venv/
.vscode/
__pycache__/
.cache/
data/
data/*
!data/.keep
all_data/
.DS_Store
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@ FROM python:3.7
COPY . /app
WORKDIR /app
RUN pip install -r requirements.txt
RUN python src/run_from_file.py -m 1
RUN pytest
RUN pylint src/ tests/ run_from_file
RUN python run_from_file.py --movie_id 1
RUN python run_from_file.py --movie_id 220
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@

## Example of usage:

`python run_from_file.py -m <movie_id>`
Please add the data files to the `data/` directory and run the Python script:

`python run_from_file.py --movie_id <movie_id>`

i.e.

`python run_from_file.py -m 1`
`python run_from_file.py --movie_id 220`

or via Docker (just change the last line of the Dockerfile):

`RUN python run_from_file.py -m 1 `
`RUN python run_from_file.py --movie_id 1 `

### Python Version

`python -V`

\$ Python 3.7.6
$ Python 3.7.6
Empty file added data/.keep
Empty file.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
pandas==1.1.1
pylint==2.6.0
pytest==6.0.1
10 changes: 6 additions & 4 deletions run_from_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@


class GetArgument:
def get_argument(self):
"""Get passed movie ID and get info about it via GetInfo instance."""
@staticmethod
def get_argument():
"""Get the movie id from argument if it is integer.
Return
Expand All @@ -27,10 +29,10 @@ def get_argument(self):
def from_file(self):
    """Read the movie ID passed as an argument and report on it via GetInfo."""
    # Parse the argument first so GetInfo is only built once an ID is available.
    requested_movie_id = self.get_argument()
    GetInfo().main(requested_movie_id)


if __name__ == "__main__":
    # Script entry point: parse the command-line argument and print the movie info.
    GetArgument().from_file()
113 changes: 101 additions & 12 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,146 @@
"""Ingest Data and Get Info from it, based on passed movie ID."""

import glob
import os
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import numpy as np
import pandas as pd


from src.settings import comments_data, movies_data


@dataclass
class IngestData:
    """Ingest csv files from one directory and concatenate them into one dataframe.

    The file named by ``first_file`` is read with its own header row; every
    other file matched by ``files_pattern`` is read with ``col_names`` as its
    column names.
    """
    first_file: str
    files_pattern: str
    col_names: List[str]
    index_col: str
    all_files: List = field(default_factory=list)
    data_directory: str = "data"

    def __post_init__(self) -> None:
        """Remember that first_file has not been joined with data_directory yet."""
        # Holds the value of first_file right after it was prefixed, so repeated
        # calls to create_paths_to_files() do not prefix it a second time.
        self._resolved_first_file = ""

    def check_if_file_exists(self) -> None:
        """Check if the first file (the one with column names) exists.

        Raises
        ----------
        FileNotFoundError
            If ``self.first_file`` does not point to an existing file.
        """
        if not os.path.isfile(self.first_file):
            raise FileNotFoundError(f"There is no such file: {self.first_file}")

    # Original (misspelled) name, kept so existing callers keep working.
    chcek_if_file_exists = check_if_file_exists

    def create_paths_to_files(self) -> None:
        """Resolve first_file inside data_directory and list the files to ingest.

        After the call ``self.first_file`` holds the joined path and
        ``self.all_files`` the sorted paths matching ``files_pattern``.
        Calling the method again does not prefix the path a second time.

        Raises
        ----------
        FileNotFoundError
            If the first file does not exist in the data directory.
        """
        if self.first_file != self._resolved_first_file:
            # First call, or first_file was reassigned since the last call.
            self.first_file = os.path.join(self.data_directory, self.first_file)
            self._resolved_first_file = self.first_file
        self.check_if_file_exists()
        # glob gives no ordering guarantee; sort so the row order is reproducible.
        self.all_files = sorted(
            glob.glob(os.path.join(self.data_directory, self.files_pattern)))

    def _read_file(self, path: str) -> pd.DataFrame:
        """Read one csv file, using its own header only for the first file."""
        if path == self.first_file:
            return pd.read_csv(path, index_col=self.index_col)
        return pd.read_csv(path, names=self.col_names, index_col=self.index_col)

    def get_df(self) -> pd.DataFrame:
        """Create concatenated df from path to files.

        Return
        ----------
        concatenated_df: pd.DataFrame
            Concatenated dataframe from all of given csv files.
        """
        self.create_paths_to_files()
        df_from_each_file = (self._read_file(path) for path in self.all_files)
        return pd.concat(df_from_each_file)


class GetInfo:
    """Load dataframes and get info from it."""

    @staticmethod
    def get_dfs() -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load dataframes via IngestData class.

        Return
        ----------
        comments_df: pd.DataFrame
            concatenated dataframe from all comments csv files
        movies_df: pd.DataFrame
            concatenated dataframe from all movies csv files
        """
        comments_df = IngestData(**comments_data).get_df()
        movies_df = IngestData(**movies_data).get_df()
        return comments_df, movies_df

    @staticmethod
    def get_movie_title(movie_id: int, movies_df: pd.DataFrame) -> Optional[str]:
        """Return movie title based on given movie_id and dataframe.

        Parameters
        ----------
        movie_id: int
            movie ID whose title should be looked up
        movies_df: pd.DataFrame
            dataframe to be searched; looked up by its index

        Return
        ----------
        title: str
            title of searched movie. If there is no such ID - return None
        """
        try:
            return movies_df.loc[movie_id]["title"]
        except KeyError:
            return None

    @staticmethod
    def get_movie_comments(movie_id: int, comments_df: pd.DataFrame) -> int:
        """Return number of comments based on given movie_id and dataframe.

        Parameters
        ----------
        movie_id: int
            movie ID whose comments should be counted
        comments_df: pd.DataFrame
            dataframe to be searched; must have an "id_movie" column

        Return
        ----------
        comments_no: int
            comments number of searched movie. If there is no such ID - return 0
        """
        # Count the matching rows. Summing the "id_movie" values themselves
        # would give movie_id * count, which is only right for movie ID 1.
        matching_rows = comments_df["id_movie"] == movie_id
        return int(matching_rows.sum())

    @staticmethod
    def check_correctness_of_data(dataframe, col_name, type_, df_name):
        """Check if values in a column are as expected in a given dataframe.

        Parameters
        ----------
        dataframe: pd.DataFrame
            dataframe holding the column to check
        col_name: str
            name of the column to check
        type_: type
            type every unique value of the column must be an instance of
        df_name: str
            name of the dataframe, used only in the error message

        Raises
        ----------
        ValueError
            If any unique value in the column is not an instance of type_.
        """
        unique_values = dataframe[col_name].unique()
        if not all(isinstance(value, type_) for value in unique_values):
            error_info = (f"There are other types than {type_} in dataframe "
                          f"{df_name} in column {col_name}. Cannot process.")
            raise ValueError(error_info)

    def main(self, movie_id: int) -> None:
        """Print title and comments number based on given movie ID.

        Parameters
        ----------
        movie_id: int
            movie_id as an integer to get its title and number of comments
        """
        comments_df, movies_df = self.get_dfs()
        self.check_correctness_of_data(comments_df, "id_movie", np.int64, "comments_df")
        self.check_correctness_of_data(movies_df, "id_game", np.int64, "movies_df")

        title = self.get_movie_title(movie_id, movies_df)
        if title is None:
            print(f"There is no title for movie ID you provided: {movie_id}")
            return

        comments_no = self.get_movie_comments(movie_id, comments_df)
        if comments_no == 0:
            comments_text = "are no comments"
        elif comments_no == 1:
            comments_text = "is one comment"
        else:
            comments_text = f"are {comments_no} comments"
        print(f"There {comments_text} for the movie: {title} (movie ID: {movie_id}).")
6 changes: 4 additions & 2 deletions src/settings.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
"""Settings how to ingest data from files in data directory."""

comments_data = {
"first_file": "comments-00.csv",
"files_pattern": "comments-*.csv",
Expand All @@ -15,7 +17,7 @@
"col_names": [
"id_movie",
"title",
"ig_game"
"id_game"
],
"index_col": "id_movie"
}
}
62 changes: 62 additions & 0 deletions tests/test_check_correctness_df_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# pylint: disable=C0103

"""Test to check if data in dataframes is correct."""

import pandas as pd
import numpy as np
import pytest

from src.main import GetInfo
from tests.test_get_info_from_df import create_movies_example_df


def create_comments_df_with_wrong_data():
    """Create example comments df whose id_movie column holds a non-integer.

    Return
    ----------
    example_df: pd.DataFrame
        six rows indexed by id_comment; the last id_movie value is the string "bum"
    """
    rows_no = 6
    data = {
        "id_comment": list(range(rows_no)),
        "user": [f"user{user_no}" for user_no in range(rows_no)],
        # Five valid integer IDs followed by one deliberately wrong value.
        "id_movie": [1] * (rows_no - 1) + ["bum"],
    }
    # Dict order already fixes the column order, so no explicit columns= is needed.
    return pd.DataFrame(data).set_index("id_comment")


def create_movies_df_with_wrong_data():
    """Create example movies df whose id_game column holds a non-integer.

    Return
    ----------
    example_df: pd.DataFrame
        six rows indexed by id_movie; the last id_game value is the string "bum"
    """
    rows_no = 6
    data = {
        "id_movie": list(range(rows_no)),
        "title": [f"user{user_no}" for user_no in range(rows_no)],
        # Five valid integer IDs followed by one deliberately wrong value.
        "id_game": [1] * (rows_no - 1) + ["bum"],
    }
    # Dict order already fixes the column order, so no explicit columns= is needed.
    return pd.DataFrame(data).set_index("id_movie")


def assert_df_with_wrong_data(example_df, type_, df_name, col_name):
    """Assert that check_correctness_of_data rejects the given dataframe.

    Parameters
    ----------
    example_df: pd.DataFrame
        dataframe containing at least one value of a wrong type
    type_: type
        type every value in the column is expected to have
    df_name: str
        dataframe name expected in the error message
    col_name: str
        name of the column to check
    """
    get_info = GetInfo()
    error_info = (f"There are other types than {type_} in dataframe "
                  f"{df_name} in column {col_name}. Cannot process.")

    with pytest.raises(ValueError) as excinfo:
        get_info.check_correctness_of_data(example_df, col_name, type_, df_name)
    # Must stay outside the with-block: statements after the raising call
    # inside it would never run.
    assert str(excinfo.value) == error_info


def test_comments_df_with_wrong_data():
    """Check that a non-integer id_movie value in the comments df is rejected."""
    assert_df_with_wrong_data(
        create_comments_df_with_wrong_data(), np.int64, "comments_df", "id_movie")


def test_movies_df_with_wrong_data():
    """Check that a non-integer id_game value in the movies df is rejected."""
    assert_df_with_wrong_data(
        create_movies_df_with_wrong_data(), np.int64, "movies_df", "id_game")

def test_movies_df_with_correct_data():
    """Check that check_correctness_of_data accepts a valid movies df."""
    movies_df = create_movies_example_df()
    # The method returns nothing when every value has the expected type.
    result = GetInfo().check_correctness_of_data(
        movies_df, "id_game", np.int64, "movies_df")
    assert result is None
16 changes: 12 additions & 4 deletions tests/test_concatenate_df.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
import pandas as pd
"""Test if IngestData concatenates data properly."""

from src.settings import comments_data, movies_data
from src.main import IngestData



def test_comments_concatenate():
    """Check the shape of the comments dataframe built by get_df."""
    comments_df = IngestData(**comments_data).get_df()
    rows_no, columns_no = comments_df.shape
    assert rows_no > 160000
    assert columns_no == 2


def test_movies_concatenate():
    """Check the shape of the movies dataframe built by get_df."""
    movies_df = IngestData(**movies_data).get_df()
    rows_no, columns_no = movies_df.shape
    assert rows_no > 200
    assert columns_no == 2
Loading

0 comments on commit 50470dd

Please sign in to comment.