"""
Loaders that read tabular data files (CSV-like text files and Excel workbooks) into pandas DataFrames.

This is the readable form of a patch that had been flattened onto a few lines. The patch touched three
files, which are kept below as clearly separated sections:

* ``DataHandling/__init__.py`` -- re-exports the three loaders (mirrored here by ``__all__``).
* ``DataHandling/loading.py``  -- the loader implementations.
* ``Tests/test_loading.py``    -- the unit tests for ``load_tabular_like_file``.
"""
import itertools
import unittest
from pathlib import Path
from typing import List, Optional, Union

import pandas as pd

# ----------------------------------------------------------------------------------------------------------------------
# DataHandling/__init__.py
# ----------------------------------------------------------------------------------------------------------------------
# The package ``__init__`` re-exports exactly these three names from the loading module.
__all__ = ['load_tabular_like_file', 'load_csv_like_file', 'load_excel_file']

# ----------------------------------------------------------------------------------------------------------------------
# DataHandling/loading.py
# ----------------------------------------------------------------------------------------------------------------------
# Candidate values that are tried, in this order, when a csv parameter has to be guessed.
ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'ascii']
SEPARATORS = [',', ';', '\t', '|']
DECIMALS = [',', '.']


def load_tabular_like_file(path: Union[str, Path], encoding: Optional[str] = None,
                           separator: Optional[str] = None, decimal: Optional[str] = None,
                           index_col: Optional[Union[str, int]] = None) -> pd.DataFrame:
    """
    Loads the data from the given path into a Pandas DataFrame.

    The loader is chosen from the file suffix (case-insensitive): ``.csv`` and ``.txt`` are read with
    ``load_csv_like_file``, ``.xlsx``, ``.xls`` and ``.xlsm`` with ``load_excel_file``. The csv related parameters
    (encoding, separator, decimal) are ignored for Excel files.

    :param path: Path to the data file as string or Path object.
    :param encoding: Encoding of the data file. If None, the function will try to detect the encoding.
    :param separator: Separator of the data file. If None, the function will try to detect the separator.
    :param decimal: Decimal separator of the data file. If None, the function will try to detect the decimal separator.
    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
    :raise: FileNotFoundError if the path does not exist.
    :raise: ValueError if the path is not a file, if the file format is not supported or if the file could not be
            parsed by the loader responsible for its suffix.
    :return: Pandas DataFrame with the loaded data.
    """
    # Path() accepts both strings and Path objects, so no type check is needed
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f'The given path {path} does not exist.')

    if not path.is_file():
        raise ValueError(f'The given path {path} is not a file.')

    suffix = path.suffix.lower()

    if suffix in ('.csv', '.txt'):
        data = load_csv_like_file(path=path, encoding=encoding, separator=separator,
                                  decimal=decimal, index_col=index_col)
        if data is None:
            raise ValueError(f'The given file {path} is not a valid csv or txt file.')
        return data

    if suffix in ('.xlsx', '.xls', '.xlsm'):
        data = load_excel_file(path=path, index_col=index_col)
        if data is None:
            raise ValueError(f'The given file {path} is not a valid excel file.')
        return data

    raise ValueError(f'The given file format {path.suffix} is currently not supported.')


def _prioritise(preferred: Optional[str], candidates: List[str]) -> List[str]:
    """Returns the candidates with the explicitly requested value (if any) moved to the front."""
    if preferred is None:
        return list(candidates)
    return [preferred] + [candidate for candidate in candidates if candidate != preferred]


def load_csv_like_file(path: Path, encoding: Optional[str] = None, separator: Optional[str] = None,
                       decimal: Optional[str] = None,
                       index_col: Optional[Union[str, int]] = None) -> Optional[pd.DataFrame]:
    """
    Loads a csv or txt file into a Pandas DataFrame.

    The file is first read with exactly the given parameters (a missing separator is sniffed by pandas, a missing
    decimal separator defaults to '.'). If that fails, every combination of ENCODINGS, SEPARATORS and DECIMALS is
    tried and the first combination that can be read is returned. Values passed by the caller are tried before the
    other candidates and the requested index column is applied to every attempt.

    :param path: Path to the data file as string or Path object.
    :param encoding: Encoding of the data file. If None, the function will try to detect the encoding.
    :param separator: Separator of the data file. If None, the function will try to detect the separator.
    :param decimal: Decimal separator of the data file. If None, the function will try to detect the decimal separator.
    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
    """
    first_try = {'encoding': encoding, 'sep': separator,
                 'decimal': '.' if decimal is None else decimal, 'index_col': index_col}
    if separator is None:
        # Only the python engine can sniff the separator. Asking for it explicitly avoids the
        # ParserWarning pandas emits when it has to fall back from the default engine on its own.
        first_try['engine'] = 'python'

    try:
        return pd.read_csv(path, **first_try)
    except Exception:
        # Best effort by design: any failure moves on to the brute-force search below
        pass

    # NOTE: the first combination that does not raise wins. This is not validated any further, so a wrong guess
    # that still parses (for example into a single column) is returned as it is.
    combinations = itertools.product(_prioritise(encoding, ENCODINGS),
                                     _prioritise(separator, SEPARATORS),
                                     _prioritise(decimal, DECIMALS))
    for enc, sep, dec in combinations:
        try:
            return pd.read_csv(path, encoding=enc, sep=sep, decimal=dec, index_col=index_col)
        except Exception:
            continue

    return None


def load_excel_file(path: Path, index_col: Optional[Union[str, int]] = None) -> Optional[pd.DataFrame]:
    """
    Loads an Excel file into a Pandas DataFrame.

    :param path: Path to the data file as string or Path object.
    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
    """
    try:
        return pd.read_excel(path, index_col=index_col)
    except Exception:
        # Documented contract: every failure is reported as None, the caller decides how to react
        return None


# ----------------------------------------------------------------------------------------------------------------------
# Tests/test_loading.py
# ----------------------------------------------------------------------------------------------------------------------
class TestLoading(unittest.TestCase):
    """
    Checks the shape of the DataFrames loaded from the sample data sets.

    The data paths are relative ('../data/...'), so the tests only find their files when they are started from a
    working directory that has the data folder next to it.
    """

    def test_load_covid_data(self):
        result = load_tabular_like_file('../data/Covid_Data.csv')
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, (52429, 21))

    def test_load_cycling_data(self):
        result = load_tabular_like_file('../data/Cycling_Data.csv')
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, (624, 28))

    def test_load_passenger_data(self):
        result = load_tabular_like_file('../data/Passenger_Stats.csv')
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, (15007, 17))

    def test_load_song_data(self):
        path = Path('../data/Radio_Songs.csv')
        result = load_tabular_like_file(path)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, (2918, 4))

    def test_load_wine_data(self):
        path = Path('../data/Wine_Malformed.csv')
        result = load_tabular_like_file(path, separator=';')
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, (4898, 5))


if __name__ == '__main__':
    unittest.main()