# Base URL that make_full_filename() prefixes onto crawl file paths.
COMMONCRAWL_S3_URL = "https://commoncrawl.s3.amazonaws.com"
# Base URL of the CommonCrawl index service.
# NOTE(review): presumably consumed by make_query_url() -- confirm.
COMMONCRAWL_INDEX_URL = "http://index.commoncrawl.org"

# Placeholder values; not referenced in the code visible here.
# NOTE(review): presumably stand-ins for an unusable URL / crawl id -- confirm.
INVALID_URL = "123"
INVALID_CRAWL = "abc"
15+
def make_full_filename(filepath):
    """Return `filepath` prefixed with the CommonCrawl S3 base URL.

    The base URL (COMMONCRAWL_S3_URL) and `filepath` are joined with a
    single '/' separator; `filepath` is used as given.
    """
    url_parts = (COMMONCRAWL_S3_URL, filepath)
    return '/'.join(url_parts)
1518
@@ -26,8 +29,12 @@ def make_query_url(crawl, url):
2629def get_location (session , url , crawl ):
2730 """ Returns success and location """
2831 query_url = make_query_url (crawl , url )
29- r = session .get (query_url )
30- result = r .json ()
32+ try :
33+ r = session .get (query_url )
34+ result = r .json ()
35+ except :
36+ return False , None
37+
3138 try :
3239 data = {
3340 "filename" : make_full_filename (result ["filename" ]),
@@ -54,34 +61,45 @@ def report_error(url, crawl, errors, total):
5461 help = 'file containing candidates' )
5562 parser .add_argument ('outfile' , type = argparse .FileType ('w' ),
5663 default = sys .stdout )
57- parser .add_argument ('-slang' , help = 'source language (e.g. en)' ,
58- default = 'en' )
59- parser .add_argument ('-tlang' , help = 'source language (e.g. it)' ,
60- default = 'it' )
64+ parser .add_argument ('-kv' , help = 'input is a .kv.gz file' ,
65+ default = False , action = "store_true" )
6166 args = parser .parse_args (sys .argv [1 :])
6267
6368 total_lines , total_errors = 0 , 0
6469 with requests .Session () as session :
6570 for line in args .candidates :
6671 total_lines += 1
6772 line = line .decode ("utf-8" )
68- _ , src_url , src_crawl , tgt_url , tgt_crawl = line .strip ().split ()
73+ if args .kv :
74+ # Lines have the format:
75+ # {domain} {url} {crawl}\t{language_data}
76+ url_data , _ = line .strip ().split ('\t ' )
77+ _ , src_url , src_crawl = url_data .strip ().split ()
78+ tgt_success = False
79+ else :
80+ # Lines have the format:
81+ # {stripped_url} {src_url} {src_crawl} {tgt_url} {tgt_crawl}
82+ _ , src_url , src_crawl , tgt_url , tgt_crawl = line .strip ().split ()
6983
7084 src_success , src_loc = get_location (session , src_url , src_crawl )
7185 if not src_success :
7286 total_errors += 1
7387 report_error (src_url , src_crawl , total_errors , total_lines )
7488
75- tgt_success , tgt_loc = get_location (session , tgt_url , tgt_crawl )
76- if not tgt_success :
77- total_errors += 1
78- report_error (tgt_url , tgt_crawl , total_errors , total_lines )
89+ if not args .kv :
90+ tgt_success , tgt_loc = get_location (session , tgt_url , tgt_crawl )
91+ if not tgt_success :
92+ total_errors += 1
93+ report_error (tgt_url , tgt_crawl , total_errors , total_lines )
7994
8095 if src_success and tgt_success :
8196 args .outfile .write ("%s\t %s\t %s\n " %
8297 (src_url , src_crawl , json .dumps (src_loc )))
8398 args .outfile .write ("%s\t %s\t %s\n " %
8499 (tgt_url , tgt_crawl , json .dumps (tgt_loc )))
100+ elif args .kv and src_success :
101+ args .outfile .write ("%s\t %s\t %s\n " %
102+ (src_url , src_crawl , json .dumps (src_loc )))
85103
86104 sys .stderr .write ("Done: " )
87105 report_error (tgt_url , tgt_crawl , total_errors , total_lines )
0 commit comments