Skip to content

Commit

Permalink
data download complete
Browse files (browse the repository at this point in the history)
  • Loading branch information
SaeedShurrab committed May 2, 2021
1 parent 9228e48 commit 8cf4832
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 16 deletions.
33 changes: 17 additions & 16 deletions data-download.py
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,8 @@
import os
import zipfile
import kaggle
from zipfile import ZipFile
from kaggle.api.kaggle_api_extended import KaggleApi


data_dir = 'data'
sub_dirs = ['raw','intermediate','preprocessed']
@@ -10,31 +13,29 @@
try:
os.mkdir(data_dir)
except:
print(f'{data_dir} directory exists!!')
print(f'{data_dir} directory exists!! \n')


for dir in sub_dirs:
try:
os.mkdir(os.path.join(os.curdir,data_dir,dir))
except:
print(f'{dir} directory exists!!')
print(f'{dir} directory exists!! \n')



api = KaggleApi()
api.authenticate()


api.dataset_download_files('yazanshannak/us-covid-tweets')

print(f'download completed \n')

with ZipFile('us-covid-tweets.zip', 'r') as arch:
arch.extractall(os.path.join('.','data','raw'))

#UN data download
#UN_URL = 'https://justdata91.s3.us-east-2.amazonaws.com/UNv1.0.ar-en.ar.tar.xz'
#response = requests.get(UN_URL)
#UN_dir = os.path.join(raw_dir,data_sources[0])

#if response.status_code == 200:
# with open(os.path.join(UN_dir,'UNv1.0.ar-en.ar.tar.xz'), "wb+") as file:
# file.write(response.content)
# print("Download completed")
#else:
# print("Download Failed!!")
os.remove('us-covid-tweets.zip')

#with tarfile.open(os.path.join(UN_dir,'UNv1.0.ar-en.ar.tar.xz'),'r') as archive:
# archive.extractall(UN_dir)
# os.remove(os.path.join(UN_dir,'UNv1.0.ar-en.ar.tar.xz'))
print('Raw data at your service \n')
86 changes: 86 additions & 0 deletions requirements.txt
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,86 @@
argon2-cffi==20.1.0
async-generator==1.10
attrs==20.3.0
backcall==0.2.0
bleach==3.3.0
certifi==2020.12.5
cffi==1.14.5
chardet==4.0.0
click==7.1.2
cycler==0.10.0
decorator==5.0.7
defusedxml==0.7.1
entrypoints==0.3
funcy==1.15
future==0.18.2
gensim==4.0.1
idna==2.10
ipykernel==5.5.3
ipython==7.23.0
ipython-genutils==0.2.0
ipywidgets==7.6.3
jedi==0.18.0
Jinja2==2.11.3
joblib==1.0.1
jsonschema==3.2.0
jupyter==1.0.0
jupyter-client==6.1.12
jupyter-console==6.4.0
jupyter-core==4.7.1
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.0
kaggle==1.5.12
kiwisolver==1.3.1
langdetect==1.0.8
MarkupSafe==1.1.1
matplotlib==3.4.1
matplotlib-inline==0.1.2
mistune==0.8.4
nbclient==0.5.3
nbconvert==6.0.7
nbformat==5.1.3
nest-asyncio==1.5.1
nltk==3.6.2
notebook==6.3.0
numexpr==2.7.3
numpy==1.20.2
packaging==20.9
pandas==1.2.4
pandocfilters==1.4.3
parso==0.8.2
pexpect==4.8.0
pickleshare==0.7.5
Pillow==8.2.0
prometheus-client==0.10.1
prompt-toolkit==3.0.18
ptyprocess==0.7.0
pycparser==2.20
Pygments==2.8.1
pyLDAvis==3.3.1
pyparsing==2.4.7
pyrsistent==0.17.3
python-dateutil==2.8.1
python-slugify==5.0.0
pytz==2021.1
pyzmq==22.0.3
qtconsole==5.0.3
QtPy==1.9.0
regex==2021.4.4
requests==2.25.1
scikit-learn==0.24.2
scipy==1.6.3
Send2Trash==1.5.0
six==1.15.0
sklearn==0.0
smart-open==5.0.0
terminado==0.9.4
testpath==0.4.4
text-unidecode==1.3
threadpoolctl==2.1.0
tornado==6.1
tqdm==4.60.0
traitlets==5.0.5
urllib3==1.26.4
wcwidth==0.2.5
webencodings==0.5.1
widgetsnbextension==3.5.1

0 comments on commit 8cf4832

Please sign in to comment.