-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
ijaniszewski
committed
Aug 27, 2020
1 parent
e9c6d01
commit 50470dd
Showing
11 changed files
with
221 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,5 +2,7 @@ venv/ | |
.vscode/ | ||
__pycache__/ | ||
.cache/ | ||
data/ | ||
data/* | ||
!data/.keep | ||
all_data/ | ||
.DS_Store |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,3 @@ | ||
pandas==1.1.1 | ||
pylint==2.6.0 | ||
pytest==6.0.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,57 +1,146 @@ | ||
"""Ingest Data and Get Info from it, based on passed movie ID.""" | ||
|
||
from dataclasses import dataclass, field | ||
from typing import List | ||
import glob | ||
import os | ||
import pandas as pd | ||
from typing import List | ||
import numpy as np | ||
|
||
|
||
from src.settings import comments_data, movies_data | ||
|
||
|
||
@dataclass
class IngestData:
    """Ingest CSV files from a data directory and concatenate them into one df.

    Attributes
    ----------
    first_file: str
        Name of the CSV file that is read with its own header row, relative
        to ``data_directory``. It is replaced by the joined path the first
        time ``create_paths_to_files`` runs.
    files_pattern: str
        Glob pattern, relative to ``data_directory``, of the files to ingest.
    col_names: List[str]
        Column names passed to ``pd.read_csv`` for every file other than
        ``first_file``.
    index_col: str
        Column used as the index of every dataframe.
    all_files: List
        Paths matched by ``files_pattern``; filled by ``create_paths_to_files``.
    data_directory: str
        Directory that holds the CSV files.
    """
    first_file: str
    files_pattern: str
    col_names: List[str]
    index_col: str
    all_files: List = field(default_factory=list)
    data_directory: str = "data"
    # Remembers that first_file already carries the data_directory prefix, so
    # repeated calls do not build "data/data/<file>". Excluded from __init__,
    # repr and comparisons to keep the public dataclass interface unchanged.
    _first_file_resolved: bool = field(
        default=False, init=False, repr=False, compare=False)

    def check_if_file_exists(self) -> None:
        """Check if the first file (the one with column names) exists.

        Raises
        ----------
        FileNotFoundError
            If ``self.first_file`` does not point to an existing file.
        """
        if not os.path.isfile(self.first_file):
            raise FileNotFoundError(f"There is no such file: {self.first_file}")

    # Original (misspelled) name kept so existing callers keep working.
    chcek_if_file_exists = check_if_file_exists

    def create_paths_to_files(self) -> None:
        """Resolve ``first_file`` and collect all files matching the pattern.

        Safe to call more than once: the directory prefix is added to
        ``first_file`` only on the first call.

        Raises
        ----------
        FileNotFoundError
            If the resolved ``first_file`` does not exist.
        """
        if not self._first_file_resolved:
            self.first_file = os.path.join(self.data_directory, self.first_file)
            self._first_file_resolved = True
        self.check_if_file_exists()
        self.all_files = glob.glob(os.path.join(self.data_directory, self.files_pattern))

    def _read_file(self, path: str) -> pd.DataFrame:
        """Read one CSV; only ``first_file`` is read with its own header row."""
        if path == self.first_file:
            return pd.read_csv(path, index_col=self.index_col)
        return pd.read_csv(path, names=self.col_names, index_col=self.index_col)

    def get_df(self) -> pd.DataFrame:
        """Create concatenated df from path to files.

        Return
        ----------
        concatenated_df: pd.DataFrame
            Concatenated dataframe from all of given csv files.
        """
        self.create_paths_to_files()
        concatenated_df = pd.concat(self._read_file(path) for path in self.all_files)
        return concatenated_df
|
||
|
||
class GetInfo:
    """Load dataframes and get info from it."""

    @staticmethod
    def get_dfs() -> (pd.DataFrame, pd.DataFrame):
        """Load dataframes via IngestData class.

        Return
        ----------
        comments_df: pd.DataFrame
            concatenated dataframe from all comments csv files
        movies_df: pd.DataFrame
            concatenated dataframe from all movies csv files
        """
        comments_df = IngestData(**comments_data).get_df()
        movies_df = IngestData(**movies_data).get_df()
        return comments_df, movies_df

    @staticmethod
    def get_movie_title(movie_id: int, movies_df: pd.DataFrame) -> str:
        """Return movie title based on given movie_id and dataframe.

        Parameters
        ----------
        movie_id: int
            index label of the movie whose title is looked up
        movies_df: pd.DataFrame
            dataframe to be searched; must have a "title" column

        Return
        ----------
        title: str
            title of searched movie. If there is no such ID - return None
        """
        try:
            return movies_df.loc[movie_id]["title"]
        except KeyError:
            return None

    @staticmethod
    def get_movie_comments(movie_id: int, comments_df: pd.DataFrame) -> int:
        """Return number of comments based on given movie_id and dataframe.

        Parameters
        ----------
        movie_id: int
            movie_id as an integer to get its number of comments
        comments_df: pd.DataFrame
            dataframe to be searched; must have an "id_movie" column

        Return
        ----------
        comments_no: int
            comments number of searched movie. If there is no such ID - return 0
        """
        # Count the matching rows. Summing the "id_movie" values of the
        # matching rows would give movie_id * count, which is only right
        # for movie ID 1.
        matching_rows = comments_df["id_movie"] == movie_id
        return int(matching_rows.sum())

    @staticmethod
    def check_correctness_of_data(dataframe, col_name, type_, df_name):
        """Check if values in a column are as expected in a given dataframe.

        Parameters
        ----------
        dataframe: pd.DataFrame
            dataframe holding the column to validate
        col_name: str
            name of the column whose unique values are checked
        type_: type
            type every unique value must be an instance of
        df_name: str
            dataframe name, used only in the error message

        Raises
        ----------
        ValueError
            If any unique value in the column is not an instance of ``type_``.
        """
        unique_values = dataframe[col_name].unique()
        if all(isinstance(value, type_) for value in unique_values):
            return
        error_info = (f"There are other types than {type_} in dataframe "
                      f"{df_name} in column {col_name}. Cannot process.")
        raise ValueError(error_info)

    def main(self, movie_id: int) -> None:
        """Print title and comments number based on given movie ID.

        Parameters
        ----------
        movie_id: int
            movie_id as an integer to get its title and number of comments
        """
        comments_df, movies_df = self.get_dfs()
        self.check_correctness_of_data(comments_df, "id_movie", np.int64, "comments_df")
        self.check_correctness_of_data(movies_df, "id_game", np.int64, "movies_df")
        title = self.get_movie_title(movie_id, movies_df)
        comments_no = self.get_movie_comments(movie_id, comments_df)
        if title is None:
            print(f"There is no title for movie ID you provided: {movie_id}")
            return
        # Pick the grammatical form matching the number of comments.
        if comments_no == 0:
            comments_text = "are no comments"
        elif comments_no == 1:
            comments_text = "is one comment"
        else:
            comments_text = f"are {comments_no} comments"
        print(f"There {comments_text} for the movie: {title} (movie ID: {movie_id}).")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# pylint: disable=C0103 | ||
|
||
"""Test to check if data in dataframes is correct.""" | ||
|
||
import pandas as pd | ||
import numpy as np | ||
import pytest | ||
|
||
from src.main import GetInfo | ||
from tests.test_get_info_from_df import create_movies_example_df | ||
|
||
|
||
def create_comments_df_with_wrong_data():
    """Create example comments df with wrong data.

    Return
    ----------
    example_df: pd.DataFrame
        Six rows indexed by "id_comment"; the last "id_movie" value is the
        string "bum" instead of an integer.
    """
    rows_no = 6
    data = {
        "id_comment": list(range(rows_no)),
        "user": [f"user{user_no}" for user_no in range(rows_no)],
        # Five valid IDs followed by one string, so the column is not all integers.
        "id_movie": [1] * (rows_no - 1) + ["bum"],
    }
    return pd.DataFrame(data).set_index("id_comment")
|
||
|
||
def create_movies_df_with_wrong_data():
    """Create example movies df with wrong data.

    Return
    ----------
    example_df: pd.DataFrame
        Six rows indexed by "id_movie"; the last "id_game" value is the
        string "bum" instead of an integer.
    """
    rows_no = 6
    data = {
        "id_movie": list(range(rows_no)),
        "title": [f"user{user_no}" for user_no in range(rows_no)],
        # Five valid IDs followed by one string, so the column is not all integers.
        "id_game": [1] * (rows_no - 1) + ["bum"],
    }
    return pd.DataFrame(data).set_index("id_movie")
|
||
|
||
def assert_df_with_wrong_data(example_df, type_, df_name, col_name):
    """Assert that check_correctness_of_data rejects the given dataframe.

    Parameters
    ----------
    example_df: pd.DataFrame
        dataframe expected to fail validation
    type_: type
        type the column values are validated against
    df_name: str
        dataframe name expected in the error message
    col_name: str
        name of the validated column
    """
    expected_message = (
        f"There are other types than {type_} in dataframe "
        f"{df_name} in column {col_name}. Cannot process."
    )
    checker = GetInfo()

    with pytest.raises(ValueError) as raised:
        checker.check_correctness_of_data(example_df, col_name, type_, df_name)
    # Compare outside the context manager, once the exception has been captured.
    assert str(raised.value) == expected_message
|
||
|
||
def test_comments_df_with_wrong_data():
    """Comments df with a non-integer "id_movie" value must be rejected."""
    assert_df_with_wrong_data(
        create_comments_df_with_wrong_data(),
        np.int64,
        "comments_df",
        "id_movie",
    )
|
||
|
||
def test_movies_df_with_wrong_data():
    """Movies df with a non-integer "id_game" value must be rejected."""
    assert_df_with_wrong_data(
        create_movies_df_with_wrong_data(),
        np.int64,
        "movies_df",
        "id_game",
    )
|
||
def test_movies_df_with_correct_data():
    """Movies df with valid data passes check_correctness_of_data silently."""
    movies_df = create_movies_example_df()
    result = GetInfo().check_correctness_of_data(
        movies_df, "id_game", np.int64, "movies_df")
    # The check returns nothing when every value has the expected type.
    assert result is None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,20 @@ | ||
import pandas as pd | ||
"""Test if IngestData concatenates data properly.""" | ||
|
||
from src.settings import comments_data, movies_data | ||
from src.main import IngestData | ||
|
||
|
||
|
||
def test_comments_concatenate():
    """Test if comments dataframe is concatenated by get_df method properly."""
    comments_df = IngestData(**comments_data).get_df()
    rows_no, columns_no = comments_df.shape
    # Only the post-commit expectations are kept; the pre-commit asserts
    # (more than 1000 rows, 4 columns) contradict them.
    assert rows_no > 160000
    assert columns_no == 2
|
||
|
||
def test_movies_concatenate():
    """Test if movies dataframe is concatenated by get_df method properly."""
    movies_ingest = IngestData(**movies_data)
    rows_no, columns_no = movies_ingest.get_df().shape
    assert rows_no > 200
    assert columns_no == 2
Oops, something went wrong.