Skip to content
This repository was archived by the owner on May 4, 2021. It is now read-only.

Commit 8d09dfa

Browse files
author
Tim Reichelt
committed
Add parsing of .kv.gz files
1 parent cee4e5e commit 8d09dfa

File tree

1 file changed

+29
-11
lines changed

1 file changed

+29
-11
lines changed

baseline/locate_candidates_cc_index_api.py

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
COMMONCRAWL_S3_URL = "https://commoncrawl.s3.amazonaws.com"
1111
COMMONCRAWL_INDEX_URL = "http://index.commoncrawl.org"
1212

13+
INVALID_URL = "123"
14+
INVALID_CRAWL = "abc"
15+
1316
def make_full_filename(filepath):
1417
return '/'.join([COMMONCRAWL_S3_URL, filepath])
1518

@@ -26,8 +29,12 @@ def make_query_url(crawl, url):
2629
def get_location(session, url, crawl):
2730
""" Returns success and location """
2831
query_url = make_query_url(crawl, url)
29-
r = session.get(query_url)
30-
result = r.json()
32+
try:
33+
r = session.get(query_url)
34+
result = r.json()
35+
except:
36+
return False, None
37+
3138
try:
3239
data = {
3340
"filename": make_full_filename(result["filename"]),
@@ -54,34 +61,45 @@ def report_error(url, crawl, errors, total):
5461
help='file containing candidates')
5562
parser.add_argument('outfile', type=argparse.FileType('w'),
5663
default=sys.stdout)
57-
parser.add_argument('-slang', help='source language (e.g. en)',
58-
default='en')
59-
parser.add_argument('-tlang', help='source language (e.g. it)',
60-
default='it')
64+
parser.add_argument('-kv', help='input is a .kv.gz file',
65+
default=False, action="store_true")
6166
args = parser.parse_args(sys.argv[1:])
6267

6368
total_lines, total_errors = 0, 0
6469
with requests.Session() as session:
6570
for line in args.candidates:
6671
total_lines += 1
6772
line = line.decode("utf-8")
68-
_, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()
73+
if args.kv:
74+
# Lines have the format:
75+
# {domain} {url} {crawl}\t{language_data}
76+
url_data, _ = line.strip().split('\t')
77+
_, src_url, src_crawl = url_data.strip().split()
78+
tgt_success = False
79+
else:
80+
# Lines have the format:
81+
# {stripped_url} {src_url} {src_crawl} {tgt_url} {tgt_crawl}
82+
_, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()
6983

7084
src_success, src_loc = get_location(session, src_url, src_crawl)
7185
if not src_success:
7286
total_errors += 1
7387
report_error(src_url, src_crawl, total_errors, total_lines)
7488

75-
tgt_success, tgt_loc = get_location(session, tgt_url, tgt_crawl)
76-
if not tgt_success:
77-
total_errors += 1
78-
report_error(tgt_url, tgt_crawl, total_errors, total_lines)
89+
if not args.kv:
90+
tgt_success, tgt_loc = get_location(session, tgt_url, tgt_crawl)
91+
if not tgt_success:
92+
total_errors += 1
93+
report_error(tgt_url, tgt_crawl, total_errors, total_lines)
7994

8095
if src_success and tgt_success:
8196
args.outfile.write("%s\t%s\t%s\n" %
8297
(src_url, src_crawl, json.dumps(src_loc)))
8398
args.outfile.write("%s\t%s\t%s\n" %
8499
(tgt_url, tgt_crawl, json.dumps(tgt_loc)))
100+
elif args.kv and src_success:
101+
args.outfile.write("%s\t%s\t%s\n" %
102+
(src_url, src_crawl, json.dumps(src_loc)))
85103

86104
sys.stderr.write("Done: ")
87105
report_error(tgt_url, tgt_crawl, total_errors, total_lines)

0 commit comments

Comments
 (0)