diff --git a/Code/merge_csv_files.ipynb b/Code/merge_csv_files.ipynb new file mode 100644 index 0000000..eea537b --- /dev/null +++ b/Code/merge_csv_files.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Merging all CSV Files in a Directory into Single CSV Containing All Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### lets first see what csv files are in the directory.." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sales_December_2019.csv\n", + "Sales_April_2019.csv\n", + "Sales_February_2019.csv\n", + "Sales_March_2019.csv\n", + "Sales_August_2019.csv\n", + "Sales_May_2019.csv\n", + "Sales_November_2019.csv\n", + "Sales_October_2019.csv\n", + "Sales_January_2019.csv\n", + "Sales_September_2019.csv\n", + "Sales_July_2019.csv\n", + "Sales_June_2019.csv\n" + ] + } + ], + "source": [ + "files=[f for f in os.listdir(\"./SalesAnalysis/Sales_Data\") if f.endswith('.csv')]\n", + "all_data = pd.DataFrame()\n", + "\n", + "for file in files:\n", + " print(file)\n", + " #df = pd.read_csv('./csse_covid_19_daily_reports_us/' + file)\n", + " #all_data = pd.concat([all_data, df])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ^ We just looked inside the directory and printed each file. Now, lets merge them into one single csv file containing all of our data.." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Order IDProductQuantity OrderedPrice EachOrder DatePurchase Address
0295665Macbook Pro Laptop1170012/30/19 00:01136 Church St, New York City, NY 10001
1295666LG Washing Machine1600.012/29/19 07:03562 2nd St, New York City, NY 10001
2295667USB-C Charging Cable111.9512/12/19 18:21277 Main St, New York City, NY 10001
329566827in FHD Monitor1149.9912/22/19 15:13410 6th St, San Francisco, CA 94016
4295669USB-C Charging Cable111.9512/18/19 12:3843 Hill St, Atlanta, GA 30301
\n", + "
"
+      ],
+      "text/plain": [
+       "  Order ID               Product Quantity Ordered Price Each      Order Date  \\\n",
+       "0   295665    Macbook Pro Laptop                1       1700  12/30/19 00:01   \n",
+       "1   295666    LG Washing Machine                1      600.0  12/29/19 07:03   \n",
+       "2   295667  USB-C Charging Cable                1      11.95  12/12/19 18:21   \n",
+       "3   295668      27in FHD Monitor                1     149.99  12/22/19 15:13   \n",
+       "4   295669  USB-C Charging Cable                1      11.95  12/18/19 12:38   \n",
+       "\n",
+       "                         Purchase Address  \n",
+       "0  136 Church St, New York City, NY 10001  \n",
+       "1     562 2nd St, New York City, NY 10001  \n",
+       "2    277 Main St, New York City, NY 10001  \n",
+       "3     410 6th St, San Francisco, CA 94016  \n",
+       "4           43 Hill St, Atlanta, GA 30301  "
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_data = pd.DataFrame()\n",
+    "for file in files:\n",
+    "    df = pd.read_csv('././SalesAnalysis/Sales_Data/' + file)\n",
+    "    all_data = pd.concat([all_data, df])\n",
+    "\n",
+    "all_data.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### We just merged each CSV file's contents into a single data frame, which can now be used for analysis. You can verify that all contents were transferred by saving the new data frame (in this case 'all_data') as a csv file using the DataFrame's to_csv method, e.g. all_data.to_csv('file_name.csv'), and then accessing it. 
"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/README.md b/README.md
index 91301b6..c5ebd0d 100644
--- a/README.md
+++ b/README.md
@@ -199,3 +199,6 @@
 Sometimes you would need a functionality which is not directly provided by Keras (i.e. a neural network which takes input from multiple data sources, and does a combined training on this data), and you want that the data generator should be able to handle the data preparation on the fly, you can create a wrapper around ImageDataGenerator class to give the required output.[This notebook](./Code/CustomDataGen_Keras.ipynb) explains a simple solution to this usecase.
 
 2. Another use case could be that you want to resize the images from a shape say 150x150 to a shape 224x224, which is generally utilized by the pretrained models, you can customize the ImageDataGenerator without coding your own data generator from ground up [(Example Notebook)](https://github.com/faizankshaikh/AV_Article_Codes/blob/master/Inception_From_Scratch/improvements/Inception_v1_from_Scratch.ipynb).
+
+- ### [Data Science Hack #45 Merging All CSV Files in a Directory into a Single CSV File Containing All Data](./Code/merge_csv_files.ipynb)
+Sometimes our data isn't compiled into a single CSV file. To make analysis easier, it helps to consolidate all of the data into one file. Follow the link to see the simple procedure.