From 750b8e6294a0acb150f066d544f3a8744f3c7dc0 Mon Sep 17 00:00:00 2001
From: NilsHMeier <113688185+ErasonMeier@users.noreply.github.com>
Date: Mon, 30 Jan 2023 15:19:48 +0100
Subject: [PATCH 1/3] Add methods to load tabular like data - Update
 __init__.py

---
 DataHandling/loading.py | 113 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 113 insertions(+)
 create mode 100644 DataHandling/loading.py

diff --git a/DataHandling/loading.py b/DataHandling/loading.py
new file mode 100644
index 0000000..75c83ee
--- /dev/null
+++ b/DataHandling/loading.py
@@ -0,0 +1,113 @@
+import itertools
+from pathlib import Path
+from typing import Union, Optional
+import pandas as pd
+
+ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'ascii']  # tried in order; latin-1/iso-8859-1 are aliases
+SEPARATORS = [',', ';', '\t', '|']  # column separators tried by the csv fallback loop
+DECIMALS = [',', '.']  # decimal marks tried by the csv fallback loop
+
+
+def load_tabular_like_file(path: Union[str, Path], encoding: str = None, separator: str = None,
+                           decimal: str = None, index_col: Union[str, int] = None) -> pd.DataFrame:
+    """
+    Loads the data from the given path into a Pandas DataFrame.
+
+    The loader is chosen from the file suffix (.csv/.txt or .xlsx/.xls/.xlsm). If the suffix is not supported, the
+    function will raise a ValueError. Csv/txt files that fail to load are retried with alternative read settings.
+
+    :param path: Path to the data file as string or Path object.
+    :param encoding: Encoding of the data file. Only used for csv/txt files; ignored for Excel files.
+    :param separator: Separator of the data file. Only used for csv/txt files; ignored for Excel files.
+    :param decimal: Decimal separator of the data file. Only used for csv/txt files; ignored for Excel files.
+    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
+    :raise: ValueError if the path is not a file, the format is not supported or the file could not be loaded.
+    :raise: FileNotFoundError if the file does not exist.
+    :return: Pandas DataFrame with the loaded data.
+    """
+    # Normalize string paths to Path objects so the checks below can use the Path API
+    if not isinstance(path, Path):
+        path = Path(path)
+
+    # Fail early if nothing exists at the given path
+    if not path.exists():
+        raise FileNotFoundError(f'The given path {path} does not exist.')
+
+    # Reject directories and other non-file paths
+    if not path.is_file():
+        raise ValueError(f'The given path {path} is not a file.')
+
+    # Csv-like files (.csv, .txt; suffix compared case-insensitively) go to the csv loader
+    if path.suffix.lower() in ['.csv', '.txt']:
+        data = load_csv_like_file(path=path, encoding=encoding, separator=separator,
+                                  decimal=decimal, index_col=index_col)
+        if data is None:
+            raise ValueError(f'The given file {path} is not a valid csv or txt file.')
+        return data
+
+    # Excel files go to the Excel loader; encoding, separator and decimal are not forwarded
+    elif path.suffix.lower() in ['.xlsx', '.xls', '.xlsm']:
+        data = load_excel_file(path=path, index_col=index_col)
+        if data is None:
+            raise ValueError(f'The given file {path} is not a valid excel file.')
+        return data
+
+    # Raise an error if the file format is not supported
+    else:
+        raise ValueError(f'The given file format {path.suffix} is currently not supported.')
+
+
+def load_csv_like_file(path: Path, encoding: str = None, separator: str = None,
+                       decimal: str = None, index_col: Union[str, int] = None) -> Optional[pd.DataFrame]:
+    """
+    Loads a csv or txt file into a Pandas DataFrame.
+
+    The file is first read with the given parameters. If that fails, all combinations of ENCODINGS, SEPARATORS and
+    DECIMALS are tried and the first successful result is returned. If no attempt succeeds, None is returned.
+
+    :param path: Path to the data file as string or Path object.
+    :param encoding: Encoding of the data file. Used for the first attempt only; the fallback tries ENCODINGS.
+    :param separator: Separator of the data file. Used for the first attempt only; the fallback tries SEPARATORS.
+    :param decimal: Decimal separator of the data file. Used for the first attempt only; the fallback tries DECIMALS.
+    :param index_col: Column to use as index for the DataFrame. Not applied in the fallback loop (first attempt only).
+    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
+    """
+    # Result placeholder; stays None if every attempt fails
+    data = None
+
+    # Use the given parameters to load the data
+    try:
+        data = pd.read_csv(path, encoding=encoding, sep=separator, decimal=decimal, index_col=index_col)
+        return data
+    except Exception:
+        pass
+
+    # Fallback: brute-force every encoding/separator/decimal combination and return the first that parses
+    for enc, sep, dec in itertools.product(ENCODINGS, SEPARATORS, DECIMALS):
+        try:
+            data = pd.read_csv(path, encoding=enc, sep=sep, decimal=dec)  # NOTE(review): index_col is not forwarded
+            return data
+        except Exception:
+            pass  # best-effort: any error just moves on to the next combination
+
+    return data
+
+
+def load_excel_file(path: Path, index_col: Union[str, int] = None) -> Optional[pd.DataFrame]:
+    """
+    Loads an Excel file into a Pandas DataFrame via a single pd.read_excel call.
+
+    :param path: Path to the data file as string or Path object.
+    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
+    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
+    """
+    # Result placeholder; stays None if reading fails
+    data = None
+
+    # Use the given parameters to load the data
+    try:
+        data = pd.read_excel(path, index_col=index_col)
+    except Exception:
+        pass  # any read error is swallowed here; the caller only sees None
+
+    return data

From 5038c5ea6c6ddb91a30ecccfce98f3760775ad19 Mon Sep 17 00:00:00 2001
From: NilsHMeier <113688185+ErasonMeier@users.noreply.github.com>
Date: Mon, 30 Jan 2023 15:37:09 +0100
Subject: [PATCH 2/3] - Update __init__.py

---
 DataHandling/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DataHandling/__init__.py b/DataHandling/__init__.py
index 8021c44..4fb78b3 100644
--- a/DataHandling/__init__.py
+++ b/DataHandling/__init__.py
@@ -1 +1,2 @@
 from .converting import convert_to_numerical, convert_to_datetime, convert_to_boolean
+from .loading import load_tabular_like_file, load_csv_like_file, load_excel_file

From ce1e2911c06831fcbf7753fbe1e3821f2b9860b3 Mon Sep 17 00:00:00 2001
From: NilsHMeier <113688185+ErasonMeier@users.noreply.github.com>
Date: Sun, 5 Feb 2023 16:38:53 +0100
Subject: [PATCH 3/3] Add testcases for data loading - Fix error due to
 decimal value set to None

---
 DataHandling/loading.py |  4 +++-
 Tests/test_loading.py   | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 Tests/test_loading.py

diff --git a/DataHandling/loading.py b/DataHandling/loading.py
index 75c83ee..affef5e 100644
--- a/DataHandling/loading.py
+++ b/DataHandling/loading.py
@@ -77,7 +77,9 @@ def load_csv_like_file(path: Path, encoding: str = None, separator: str = None,
 
     # Use the given parameters to load the data
     try:
-        data = pd.read_csv(path, encoding=encoding, sep=separator, decimal=decimal, index_col=index_col)
+        data = pd.read_csv(path, encoding=encoding, sep=separator,
+                           decimal=decimal if decimal is not None else '.',
+                           index_col=index_col)
         return data
     except Exception:
         pass
diff --git a/Tests/test_loading.py b/Tests/test_loading.py
new file mode 100644
index 0000000..3258aa2
--- /dev/null
+++ b/Tests/test_loading.py
@@ -0,0 +1,33 @@
+import unittest
+import pandas as pd
+from pathlib import Path
+from DataHandling import load_tabular_like_file
+
+
+class TestLoading(unittest.TestCase):  # each test loads one sample file and checks result type and shape
+    def test_load_covid_data(self):
+        result = load_tabular_like_file('../data/Covid_Data.csv')  # NOTE(review): path is relative to the CWD
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (52429, 21))
+
+    def test_load_cycling_data(self):
+        result = load_tabular_like_file('../data/Cycling_Data.csv')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (624, 28))
+
+    def test_load_passenger_data(self):
+        result = load_tabular_like_file('../data/Passenger_Stats.csv')
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (15007, 17))
+
+    def test_load_song_data(self):
+        path = Path('../data/Radio_Songs.csv')  # passes a Path object instead of a string
+        result = load_tabular_like_file(path)
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (2918, 4))
+
+    def test_load_wine_data(self):
+        path = Path('../data/Wine_Malformed.csv')
+        result = load_tabular_like_file(path, separator=';')  # separator is given explicitly for this file
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape, (4898, 5))