Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DataHandling/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from .converting import convert_to_numerical, convert_to_datetime, convert_to_boolean
from .loading import load_tabular_like_file, load_csv_like_file, load_excel_file
115 changes: 115 additions & 0 deletions DataHandling/loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import itertools
from pathlib import Path
from typing import Union, Optional
import pandas as pd

# Candidate values used by load_csv_like_file when reading with the caller's parameters fails.
# Order matters: combinations are tried in list order and the first one that parses is returned.
# NOTE: 'latin-1' and 'iso-8859-1' are aliases of the same codec in Python.
ENCODINGS = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252', 'ascii']
# Field delimiters to try.
SEPARATORS = [',', ';', '\t', '|']
# Decimal marks to try.
DECIMALS = [',', '.']


def load_tabular_like_file(path: Union[str, Path], encoding: str = None, separator: str = None,
                           decimal: str = None, index_col: Union[str, int] = None) -> pd.DataFrame:
    """
    Read a tabular data file from disk into a Pandas DataFrame.

    The loader is chosen from the file extension: ``.csv`` and ``.txt`` files are handed to
    ``load_csv_like_file``, ``.xlsx``, ``.xls`` and ``.xlsm`` files to ``load_excel_file``.
    The extension check is case-insensitive.

    :param path: Location of the data file, either as a string or as a Path object.
    :param encoding: Text encoding of the file. Only used for csv-like files; None lets the loader detect it.
    :param separator: Field separator of the file. Only used for csv-like files; None lets the loader detect it.
    :param decimal: Decimal mark of the file. Only used for csv-like files; None lets the loader detect it.
    :param index_col: Column to use as the DataFrame index. None means no index column.
    :raise: FileNotFoundError if nothing exists at the given path.
    :raise: ValueError if the path is not a file, the extension is not supported, or the
            selected loader could not read the file.
    :return: Pandas DataFrame with the loaded data.
    """
    # Normalise string input so the pathlib API is available below
    path = path if isinstance(path, Path) else Path(path)

    # Guard clauses: reject paths that cannot be loaded at all
    if not path.exists():
        raise FileNotFoundError(f'The given path {path} does not exist.')
    if not path.is_file():
        raise ValueError(f'The given path {path} is not a file.')

    extension = path.suffix.lower()

    # Delimited text files
    if extension in ('.csv', '.txt'):
        frame = load_csv_like_file(path=path, encoding=encoding, separator=separator,
                                   decimal=decimal, index_col=index_col)
        if frame is None:
            raise ValueError(f'The given file {path} is not a valid csv or txt file.')
        return frame

    # Excel workbooks
    if extension in ('.xlsx', '.xls', '.xlsm'):
        frame = load_excel_file(path=path, index_col=index_col)
        if frame is None:
            raise ValueError(f'The given file {path} is not a valid excel file.')
        return frame

    # Anything else is unsupported
    raise ValueError(f'The given file format {path.suffix} is currently not supported.')


def load_csv_like_file(path: Path, encoding: str = None, separator: str = None,
                       decimal: str = None, index_col: Union[str, int] = None) -> Optional[pd.DataFrame]:
    """
    Loads a csv or txt file into a Pandas DataFrame.

    The file is first read with exactly the given parameters (a missing decimal mark defaults to '.').
    If that fails, the function tries combinations of encoding, separator and decimal mark and returns
    the first one that can be parsed. Values given by the caller are tried before the module defaults,
    and the index column is applied in every attempt.

    :param path: Path to the data file as string or Path object.
    :param encoding: Encoding of the data file. If None, the function will try to detect the encoding.
    :param separator: Separator of the data file. If None, the function will try to detect the separator.
    :param decimal: Decimal separator of the data file. If None, the function will try to detect the decimal separator.
    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
    :return: Pandas DataFrame with the loaded data or None if the file could not be loaded.
    """
    # First attempt: read the file exactly as requested by the caller
    try:
        return pd.read_csv(path, encoding=encoding, sep=separator,
                           decimal=decimal if decimal is not None else '.',
                           index_col=index_col)
    except Exception:
        # Deliberate best effort: any failure simply triggers the detection below
        pass

    def _candidates(preferred, defaults):
        """Return the values to try, with the caller's explicit choice (if any) first."""
        if preferred is None:
            return list(defaults)
        return [preferred] + [value for value in defaults if value != preferred]

    combinations = itertools.product(_candidates(encoding, ENCODINGS),
                                     _candidates(separator, SEPARATORS),
                                     _candidates(decimal, DECIMALS))

    # Detection: the first combination that parses without an error wins
    for enc, sep, dec in combinations:
        # The same character cannot be both field separator and decimal mark
        if sep == dec:
            continue
        try:
            return pd.read_csv(path, encoding=enc, sep=sep, decimal=dec, index_col=index_col)
        except Exception:
            continue

    # No combination could read the file
    return None


def load_excel_file(path: Path, index_col: Union[str, int] = None) -> Optional[pd.DataFrame]:
    """
    Read an Excel workbook into a Pandas DataFrame on a best-effort basis.

    :param path: Path to the data file as string or Path object.
    :param index_col: Column to use as index for the DataFrame. If None, no column will be used as index.
    :return: Pandas DataFrame with the loaded data, or None if reading the file raised any error.
    """
    try:
        workbook_frame = pd.read_excel(path, index_col=index_col)
    except Exception:
        # Every failure is reported to the caller as "could not be loaded"
        return None

    return workbook_frame
33 changes: 33 additions & 0 deletions Tests/test_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import unittest
import pandas as pd
from pathlib import Path
from DataHandling import load_tabular_like_file


class TestLoading(unittest.TestCase):
    """Checks that load_tabular_like_file reads the sample data sets with the expected shapes."""

    # Anchored to this file so the tests do not depend on the current working directory
    DATA_DIR = Path(__file__).resolve().parent.parent / 'data'

    def _assert_loads(self, source, expected_shape, **options):
        """Load ``source`` and assert that the result is a DataFrame of ``expected_shape``."""
        result = load_tabular_like_file(source, **options)
        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, expected_shape)

    def test_load_covid_data(self):
        # Passed as a string to cover the str -> Path conversion
        self._assert_loads(str(self.DATA_DIR / 'Covid_Data.csv'), (52429, 21))

    def test_load_cycling_data(self):
        self._assert_loads(str(self.DATA_DIR / 'Cycling_Data.csv'), (624, 28))

    def test_load_passenger_data(self):
        self._assert_loads(str(self.DATA_DIR / 'Passenger_Stats.csv'), (15007, 17))

    def test_load_song_data(self):
        # Passed as a Path object
        self._assert_loads(self.DATA_DIR / 'Radio_Songs.csv', (2918, 4))

    def test_load_wine_data(self):
        # Malformed file: the separator is given explicitly
        self._assert_loads(self.DATA_DIR / 'Wine_Malformed.csv', (4898, 5), separator=';')