# TODO: run only for first 10 links. Keep on updating the csv file for wikipedia links. Get links only from the file.
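"""Scrape English Wikipedia articles and their Indic-language counterparts.

Starting from a seed link, the pipeline crawls article links breadth-first,
writes the paragraph text of each crawled article (and of its versions in the
accepted languages) to Scraped_Files/, and records the discovered links in
Next_Links.csv and the URL-to-ID mapping in Links_to_UUID.csv.
"""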
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import os
import time
import math
import random
import pandas as pd
# import pdfkit  # needed only by get_pdf(), which the current pipeline does not call
# Languages (and their ISO 639-1 codes) whose Wikipedia versions we keep.
acceptable_langs = ['Kannada', 'Bangla', 'Marathi', 'Odia', 'Urdu', 'Tamil', 'Hindi', 'Assamese', 'Malayalam', 'Telugu', 'Punjabi', 'Gujarati']
acceptable_lang_c = ['kn', 'bn', 'mr', 'or', 'ur', 'ta', 'hi', 'as', 'ml', 'te', 'pa', 'gu']
# Seed link; get_next_links() appends newly discovered article links here as pages are crawled.
next_links = ['/wiki/India']
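# One possible way to address the TODO above (a sketch only; nothing in this
# pipeline calls it yet): seed the crawl from the Next_Links.csv written by
# main() instead of the hard-coded '/wiki/India'. The function name is
# illustrative and not part of the original script.
def load_seed_links(csv_path='Next_Links.csv'):
    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)['Links'].tolist()
    return ['/wiki/India']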
def get_next_links(url):
    """Append new article links found on the page at url to the global
    next_links list, skipping files, meta pages, anchors and disambiguation pages."""
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html.parser")
    div = soup.find('div', attrs={'id': 'content'})
    aas = div.find_all('a', href=True)
    for a in aas:
        if a is None:
            continue
        elif '/wiki/File:' in a['href']:
            continue
        elif ('/wiki/Wikipedia:' in a['href'] or '/wiki/Category:' in a['href']
              or 'Portal:' in a['href'] or 'Template' in a['href']):
            continue
        elif '#' in a['href']:
            continue
        elif '_(disambiguation)' in a['href']:
            continue
        elif '/wiki/' in a['href'] and a['href'] not in next_links:
            next_links.append(a['href'])
def get_id():
    # Not a real UUID: six hex characters derived from a random float, which is
    # enough to give each scraped article a short file identifier.
    uuid = str(hex(math.floor((2 + random.random()) * 0x80000000)))[2:8]
    return uuid
def scrape_and_write(url, file_name):
    """Write the paragraph text of the article at url to file_name,
    stripping citation markers such as [1] from the output."""
    text = ''
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    div = soup.find('div', attrs={'id': 'content'})
    p_tags = div.find_all('p')
    for p in p_tags:
        if p.get_text().strip() != '':
            # Collapse whitespace, then drop the reference superscripts
            # (e.g. "[1]") that Wikipedia embeds in paragraph text.
            sups = p.find_all('sup', attrs={'class': 'reference'})
            text += ' '.join(p.get_text().strip().split())
            for sup in sups:
                text = text.replace(sup.get_text(), '')
            text += ' '
    with open(file_name, 'w', encoding='utf-16') as f:
        f.write(text)
def get_pdf(uuid, url, lang):
    # Requires the pdfkit import above to be uncommented (and wkhtmltopdf to be
    # installed); not called by the current pipeline.
    pdfkit.from_url(url, os.path.join('PDF', uuid + '-' + lang + '.pdf'))
def get_other_langs(uuid, url, path):
    """Scrape the interlanguage versions of the article at url for every
    accepted language code and write each one to its own text file."""
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    nav = soup.find('nav', attrs={'id': 'p-lang'})
    if nav is None:  # page has no language sidebar
        return
    lis = nav.find_all('li')
    for li in lis:
        a = li.find('a')
        lang = a['lang']
        if lang in acceptable_lang_c:
            scrape_and_write(a['href'], os.path.join(path, uuid + '-' + lang + '.txt'))
def main():
    url_to_uuid = []
    path = 'Scraped_Files'
    Path(path).mkdir(parents=True, exist_ok=True)
    base_url = 'https://en.wikipedia.org'
    # next_links grows while we iterate over it, so this is a breadth-first
    # crawl capped at 75 pages.
    for i, u in enumerate(next_links):
        if i == 75:
            break
        print(u)
        url = base_url + u
        uuid = get_id()
        url_to_uuid.append([url, uuid])
        scrape_and_write(url, os.path.join(path, uuid + '-en.txt'))
        get_other_langs(uuid, url, path)
        get_next_links(url)
    print(len(next_links))
    df = pd.DataFrame(next_links, columns=['Links'])
    df.to_csv('Next_Links.csv', index=False)
    df = pd.DataFrame(url_to_uuid, columns=['URL', 'UUID'])
    df.to_csv('Links_to_UUID.csv', index=False)

if __name__ == '__main__':
    Path('PDF').mkdir(parents=True, exist_ok=True)
    main()