
Commit 51926c1

Merge pull request #112 from WangKehanK/Final_Deliverable
Final deliverable
2 parents e5f0265 + 0b48e7d commit 51926c1

9 files changed, +8769 -0 lines changed


t_visa_trends_team1/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
.ipynb_checkpoints
Lines changed: 224 additions & 0 deletions
@@ -0,0 +1,224 @@
import re
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd
import subprocess
from tqdm import tqdm

import PyPDF2
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

# Global variables
INDEX_FILE = './pdf_name_url.csv'
PDF_DIR = './download_pdf'
if not os.path.exists(PDF_DIR):
    os.mkdir(PDF_DIR)
RESULT_FILE = './result.csv'


# spider
def spider_url_and_download_pdf(update_index=True):
    """Loop through the target search site and download decision PDFs.

    update_index: True to scrape the index again; False to reuse the existing index file.
    """
    PAGE_NUM = 1  # page number
    NULL_PAGE_DISP = 'Sorry, no results found'
    TITLE_MUST_DISP = 'Application for T Nonimmigrant Status'

    if os.path.exists(INDEX_FILE):
        DATA_HIST = [(na, url) for na, url in pd.read_csv(INDEX_FILE).values]
    else:
        DATA_HIST = []

    # When update_index is True, the crawler loops through the site again and checks for new entries.
    if update_index:
        DATA_NEW = []
        while True:
            url = f'https://search.usa.gov/search?affiliate=uscis-aao&dc=1847&page={PAGE_NUM}&query=form+I-914&search=Search&utf8=%E2%9C%93'
            try:
                res = requests.get(url)

                # no more data
                if NULL_PAGE_DISP in res.text:
                    print('No more data, spider done.')
                    break

                soup = BeautifulSoup(res.text, 'html.parser')
                divs = soup.find_all('div', attrs={'class': 'content-block-item result'})

                title_urls = [re.split(r'\n+', d.text.strip())[1:3] for d in divs]
                valid_title_urls = [(title, url) for title, url in title_urls
                                    if TITLE_MUST_DISP in title]
                new_data = [tup for tup in valid_title_urls if tup not in DATA_HIST]
                print(f'Page {PAGE_NUM}, found {len(new_data)} new urls.')

                DATA_NEW.extend(new_data)
                PAGE_NUM += 1

            except Exception:
                import traceback
                traceback.print_exc()
                import ipdb
                ipdb.set_trace()

        # update the history file
        DATA_HIST.extend(DATA_NEW)
        pd.DataFrame(DATA_HIST, columns=['title', 'url']).drop_duplicates().to_csv(INDEX_FILE,
                                                                                   index=None)

    # download pdf files
    df_hist = pd.read_csv(INDEX_FILE)
    for title, url in tqdm(df_hist.values, desc='pdf download'):
        try:
            fname = os.path.basename(url)
            fpath = os.path.join(PDF_DIR, fname)
            if os.path.exists(fpath):
                continue
            subprocess.check_call(['wget', url], cwd=PDF_DIR, stderr=subprocess.DEVNULL)

        except Exception:
            print(f'{url} download failed')


def parse_by_pdfminer(pdf_path):

    parser = PDFParser(open(pdf_path, 'rb'))
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)

    doc.initialize()

    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page_txts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            layout_txts = []
            for x in layout:
                if isinstance(x, LTTextBox):
                    layout_txts.append(x.get_text().strip())
            page_txts.append('\n'.join(layout_txts))

        return page_txts


def parse_by_PyPDF2(pdf_path):

    with open(pdf_path, mode='rb') as mypdf:
        pdf_document = PyPDF2.PdfFileReader(mypdf)

        page_num = pdf_document.numPages
        page_txts = []
        for page in range(page_num):
            txt = pdf_document.getPage(page).extractText()
            page_txts.append(txt)

    return page_txts


def extract_info(page_txts):

    # 1. get id
    # Get match in first page: "In Re: 9435010" or last page: "ID# 1940904"
    match1 = re.search(r'In Re: (\d+)', page_txts[0], re.IGNORECASE)
    match2 = re.search(r'# (\d+)', page_txts[-1], re.IGNORECASE)

    if match1:
        ID = match1.group(1)
    elif match2:
        ID = match2.group(1)
    else:
        ID = None

    # 2. date decision status
    # match_date = re.search('DA\s*TE.*?:(.*?\d{4})', page_txts[0], re.IGNORECASE)
    # date = match_date.group(1).strip() if match_date else None

    match_decision = re.search(r'(appeal|motion).*?decision', page_txts[0], re.IGNORECASE)
    decision = match_decision.group() if match_decision else None

    match_status = re.search(r'FORM [i1l]-914.*?status', page_txts[0], re.IGNORECASE)
    status = match_status.group() if match_status else None
    if status:
        # normalize OCR variants such as "l-914" or "1-914" to "I-914"
        status = re.sub(r'[il1]-', 'I-', status, flags=re.IGNORECASE)

    match_order = re.search(r'ORDER:.*?(The.*?\.)', page_txts[-1], re.IGNORECASE)
    order = match_order.group(1).strip() if match_order else None

    match_desc = re.search(r'(The Applicant.*?)(\s+I\.|$)', page_txts[0], re.IGNORECASE | re.S)
    desc = match_desc.group(1).strip() if match_desc else None
    # if not desc:
    #     raise ValueError

    match_is_family = re.search('FAMILY MEMBER', page_txts[0], re.IGNORECASE)
    is_family = bool(match_is_family)

    return {
        'ID': ID,
        'decision': decision,
        'status': status,
        'order': order,
        'is_family': is_family,
        'desc': desc,
    }


def parse_pdf_info(tool='PyPDF2'):

    tool2func = {'pdfminer': parse_by_pdfminer, 'PyPDF2': parse_by_PyPDF2}

    BASE_INFO = {
        'url': None,
        'file_name': None,
        'ID': None,
        'date': None,
        'decision': None,
        'status': None,
        'order': None,
        'is_family': None,
    }
    ALL_DATA = []
    urls = pd.read_csv(INDEX_FILE).url
    for url in tqdm(urls, desc='Parse pdf file'):

        data = BASE_INFO.copy()

        fname = os.path.basename(url)
        data['file_name'] = fname
        data['url'] = url
        data['date'] = fname.split('_')[0]

        fp = os.path.join(PDF_DIR, fname)
        try:
            page_txts = tool2func[tool](fp)
            extract_dict = extract_info(page_txts)
            data.update(extract_dict)

            ALL_DATA.append(data)

        except Exception:
            import traceback
            traceback.print_exc()
            print(f'{fname} failed')
            import ipdb
            ipdb.set_trace()

    pd.DataFrame(ALL_DATA).to_csv(RESULT_FILE, index=None)


if __name__ == "__main__":
    spider_url_and_download_pdf(update_index=True)
    parse_pdf_info()
