From 750b8e6294a0acb150f066d544f3a8744f3c7dc0 Mon Sep 17 00:00:00 2001
From: NilsHMeier <113688185+ErasonMeier@users.noreply.github.com>
Date: Mon, 30 Jan 2023 15:19:48 +0100
Subject: [PATCH 1/3] Add methods to load tabular like data - Update __init__.py

---
 DataHandling/loading.py | 113 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 DataHandling/loading.py

diff --git a/DataHandling/loading.py b/DataHandling/loading.py
new file mode 100644
index 0000000..75c83ee
--- /dev/null
+++ b/DataHandling/loading.py
@@ -0,0 +1,113 @@
+import itertools
+from pathlib import Path
+from typing import Union, Optional
+import pandas as pd
+
+ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'ascii']
+SEPARATORS = [',', ';', '\t', '|']
+DECIMALS = [',', '.']
+
+
+def load_tabular_like_file(path: Union[str, Path], encoding: str = None, separator: str = None,
+                           decimal: str = None, index_col: Union[str, int] = None) -> pd.DataFrame:
+    """
+    Loads the data from the given path into a Pandas DataFrame.
+
+    The function automatically detects the file format and loads the data accordingly. If the file format is not
+    supported, the function will raise a ValueError. Furthermore, malformed files are loaded on a best-effort basis.
+
+    :param path: Path to the data file as string or Path object.
+    :param encoding: Encoding of the data file. If None, the function will try to detect the encoding.
+    :param separator: Separator of the data file. If None, the function will try to detect the separator.
+    :param decimal: Decimal separator of the data file. If None, the function will try to detect the decimal separator.
+    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
+    :raise: ValueError if the path is not a file, the file format is not supported or the file could not be loaded.
+    :raise: FileNotFoundError if the file does not exist.
+    :return: Pandas DataFrame with the loaded data.
+    """
+    # Convert the given path to a Path object if necessary
+    if not isinstance(path, Path):
+        path = Path(path)
+
+    # Check if the given path exists
+    if not path.exists():
+        raise FileNotFoundError(f'The given path {path} does not exist.')
+
+    # Check if the given path is a file
+    if not path.is_file():
+        raise ValueError(f'The given path {path} is not a file.')
+
+    # Check if the given path is a csv or txt file
+    if path.suffix.lower() in ['.csv', '.txt']:
+        data = load_csv_like_file(path=path, encoding=encoding, separator=separator,
+                                  decimal=decimal, index_col=index_col)
+        if data is None:
+            raise ValueError(f'The given file {path} is not a valid csv or txt file.')
+        return data
+
+    # Check if the given path is an Excel file
+    elif path.suffix.lower() in ['.xlsx', '.xls', '.xlsm']:
+        data = load_excel_file(path=path, index_col=index_col)
+        if data is None:
+            raise ValueError(f'The given file {path} is not a valid excel file.')
+        return data
+
+    # Raise an error if the file format is not supported
+    else:
+        raise ValueError(f'The given file format {path.suffix} is currently not supported.')
+
+
+def load_csv_like_file(path: Path, encoding: str = None, separator: str = None,
+                       decimal: str = None, index_col: Union[str, int] = None) -> Optional[pd.DataFrame]:
+    """
+    Loads a csv or txt file into a Pandas DataFrame.
+
+    The function automatically detects the encoding, separator and decimal separator of the file. If the file could not
+    be loaded, the function will return None.
+
+    :param path: Path to the data file as string or Path object.
+    :param encoding: Encoding of the data file. If None, the function will try to detect the encoding.
+    :param separator: Separator of the data file. If None, the function will try to detect the separator.
+    :param decimal: Decimal separator of the data file. If None, the function will try to detect the decimal separator.
+    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
+    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
+    """
+    # Result placeholder, stays None if the file could not be loaded
+    data = None
+
+    # Use the given parameters to load the data
+    try:
+        data = pd.read_csv(path, encoding=encoding, sep=separator, decimal=decimal, index_col=index_col)
+        return data
+    except Exception:
+        pass
+
+    # If the loading fails, try all combinations of known encodings, separators and decimal separators
+    for enc, sep, dec in itertools.product(ENCODINGS, SEPARATORS, DECIMALS):
+        try:
+            data = pd.read_csv(path, encoding=enc, sep=sep, decimal=dec, index_col=index_col)
+            return data
+        except Exception:
+            pass
+
+    return data
+
+
+def load_excel_file(path: Path, index_col: Union[str, int] = None) -> Optional[pd.DataFrame]:
+    """
+    Loads an Excel file into a Pandas DataFrame.
+
+    :param path: Path to the data file as string or Path object.
+    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
+    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
+    """
+    # Result placeholder, stays None if the file could not be loaded
+    data = None
+
+    # Use the given parameters to load the data
+    try:
+        data = pd.read_excel(path, index_col=index_col)
+    except Exception:
+        pass
+
+    return data

From 5038c5ea6c6ddb91a30ecccfce98f3760775ad19 Mon Sep 17 00:00:00 2001
From: NilsHMeier <113688185+ErasonMeier@users.noreply.github.com>
Date: Mon, 30 Jan 2023 15:37:09 +0100
Subject: [PATCH 2/3] - Update __init__.py

---
 DataHandling/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DataHandling/__init__.py b/DataHandling/__init__.py
index 8021c44..4fb78b3 100644
--- a/DataHandling/__init__.py
+++ b/DataHandling/__init__.py
@@ -1 +1,2 @@
 from .converting import convert_to_numerical, convert_to_datetime, convert_to_boolean
+from .loading import load_tabular_like_file, load_csv_like_file, load_excel_file

From ce1e2911c06831fcbf7753fbe1e3821f2b9860b3 Mon Sep 17 00:00:00 2001
From: NilsHMeier <113688185+ErasonMeier@users.noreply.github.com>
Date: Sun, 5 Feb 2023 16:38:53 +0100
Subject: [PATCH 3/3] Add testcases for data loading - Fix error due to decimal separator value set to None

---
 DataHandling/loading.py |  4 +++-
 Tests/test_loading.py   | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 Tests/test_loading.py

diff --git a/DataHandling/loading.py b/DataHandling/loading.py
index 75c83ee..affef5e 100644
--- a/DataHandling/loading.py
+++ b/DataHandling/loading.py
@@ -77,7 +77,9 @@ def load_csv_like_file(path: Path, encoding: str = None, separator: str = None,
 
     # Use the given parameters to load the data
     try:
-        data = pd.read_csv(path, encoding=encoding, sep=separator, decimal=decimal, index_col=index_col)
+        data = pd.read_csv(path, encoding=encoding, sep=separator,
+                           decimal=decimal if decimal is not None else '.',
+                           index_col=index_col)
         return data
     except Exception:
         pass
diff --git a/Tests/test_loading.py b/Tests/test_loading.py
new file mode 100644
index 0000000..3258aa2
--- /dev/null
+++ b/Tests/test_loading.py
@@ -0,0 +1,33 @@
+import unittest
+import pandas as pd
+from pathlib import Path
+from DataHandling import load_tabular_like_file
+
+
+class TestLoading(unittest.TestCase):  # NOTE: fixture paths below are relative to the working directory
+    def test_load_covid_data(self):
+        result = load_tabular_like_file('../data/Covid_Data.csv')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (52429, 21))
+
+    def test_load_cycling_data(self):
+        result = load_tabular_like_file('../data/Cycling_Data.csv')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (624, 28))
+
+    def test_load_passenger_data(self):
+        result = load_tabular_like_file('../data/Passenger_Stats.csv')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (15007, 17))
+
+    def test_load_song_data(self):
+        path = Path('../data/Radio_Songs.csv')
+        result = load_tabular_like_file(path)
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (2918, 4))
+
+    def test_load_wine_data(self):
+        path = Path('../data/Wine_Malformed.csv')
+        result = load_tabular_like_file(path, separator=';')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (4898, 5))