Add parsing of .kv.gz files

Tim Reichelt · Tim Reichelt · commit 8d09dfa3a575 · 2017-08-01T13:04:09.000Z
diff --git a/baseline/locate_candidates_cc_index_api.py b/baseline/locate_candidates_cc_index_api.py
@@ -10,6 +10,9 @@
 COMMONCRAWL_S3_URL = "https://commoncrawl.s3.amazonaws.com"
 COMMONCRAWL_INDEX_URL = "http://index.commoncrawl.org"
 
+INVALID_URL = "123"
+INVALID_CRAWL = "abc"
+
 def make_full_filename(filepath):
     return '/'.join([COMMONCRAWL_S3_URL, filepath])
 
@@ -26,8 +29,12 @@ def make_query_url(crawl, url):
 def get_location(session, url, crawl):
     """ Returns success and location """
     query_url = make_query_url(crawl, url)
-    r = session.get(query_url)
-    result = r.json()
+    try:
+        r = session.get(query_url)
+        result = r.json()
+    except:
+        return False, None
+
     try:
         data = {
             "filename": make_full_filename(result["filename"]),
@@ -54,34 +61,45 @@ def report_error(url, crawl, errors, total):
                         help='file containing candidates')
     parser.add_argument('outfile', type=argparse.FileType('w'),
                         default=sys.stdout)
-    parser.add_argument('-slang', help='source language (e.g. en)',
-                        default='en')
-    parser.add_argument('-tlang', help='source language (e.g. it)',
-                        default='it')
+    parser.add_argument('-kv', help='input is a .kv.gz file',
+                        default=False, action="store_true")
     args = parser.parse_args(sys.argv[1:])
 
     total_lines, total_errors = 0, 0
     with requests.Session() as session:
         for line in args.candidates:
             total_lines += 1
             line = line.decode("utf-8")
-            _, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()
+            if args.kv:
+                # Lines have the format:
+                # {domain} {url} {crawl}\t{language_data}
+                url_data, _ = line.strip().split('\t')
+                _, src_url, src_crawl = url_data.strip().split()
+                tgt_success = False
+            else:
+                # Lines have the format:
+                # {stripped_url} {src_url} {src_crawl} {tgt_url} {tgt_crawl}
+                _, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()
 
             src_success, src_loc = get_location(session, src_url, src_crawl)
             if not src_success:
                 total_errors += 1
                 report_error(src_url, src_crawl, total_errors, total_lines)
 
-            tgt_success, tgt_loc = get_location(session, tgt_url, tgt_crawl)
-            if not tgt_success:
-                total_errors += 1
-                report_error(tgt_url, tgt_crawl, total_errors, total_lines)
+            if not args.kv:
+                tgt_success, tgt_loc = get_location(session, tgt_url, tgt_crawl)
+                if not tgt_success:
+                    total_errors += 1
+                    report_error(tgt_url, tgt_crawl, total_errors, total_lines)
 
             if src_success and tgt_success:
                 args.outfile.write("%s\t%s\t%s\n" %
                                    (src_url, src_crawl, json.dumps(src_loc)))
                 args.outfile.write("%s\t%s\t%s\n" %
                                    (tgt_url, tgt_crawl, json.dumps(tgt_loc)))
+            elif args.kv and src_success:
+                args.outfile.write("%s\t%s\t%s\n" %
+                                   (src_url, src_crawl, json.dumps(src_loc)))
 
     sys.stderr.write("Done: ")
     report_error(tgt_url, tgt_crawl, total_errors, total_lines)