
Commit 3ec4526

Merge pull request #11 from treigerm/dev
Add support for CommonCrawl Index Server API
2 parents 4308473 + 8d09dfa commit 3ec4526

2 files changed: +112 −0 lines


baseline/baseline.md

Lines changed: 7 additions & 0 deletions
@@ -37,10 +37,17 @@ nohup gzip -cd /mnt/langsplit/2015_32_kv.gz | ~/DataCollection/baseline/langstat

If you are collecting data for a language direction for which you have already collected data in the reverse direction, please see the optimized process in the appendix.

## Step 3: Look up where these URLs appear in CommonCrawl S3

### Option 1 (if you have built your own location database)
```
nohup cat candidates.en-de | nice ~/DataCollection/baseline/locate_candidates.py - - -server='http://statmt.org:8084/query_prefix' > candidates.en-de.locations 2> locate.log &
```

### Option 2 (use the [CommonCrawl Index API](http://commoncrawl.org/2015/04/announcing-the-common-crawl-index/))
```
nohup cat candidates.en-de | nice ~/DataCollection/baseline/locate_candidates_cc_index_api.py - - > candidates.en-de.locations 2> locate.log &
```
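For orientation, the index endpoint behind Option 2 can also be queried by hand. The snippet below is a minimal sketch, not part of this repository; the crawl id `2015-32` and the URL are placeholders:

```python
# Minimal sketch: look up one URL in the CommonCrawl index (illustrative crawl id and URL).
import json
import requests

r = requests.get("http://index.commoncrawl.org/CC-MAIN-2015-32-index",
                 params={"url": "commoncrawl.org/", "output": "json", "limit": 1})
# With output=json the server returns one JSON object per line.
record = json.loads(r.text.splitlines()[0])
print(record["filename"], record["offset"], record["length"], record["mime"])
```

The `filename`, `offset`, and `length` fields of such records are what the new `locate_candidates_cc_index_api.py` script writes into the `.locations` file.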
## Step 4: Download pages from CommonCrawl S3 and extract text

For certain language pairs we provide the `.locations` files in compressed form in our releases on https://github.com/ModernMT/DataCollection/releases. You can use these files to start the process in this step.
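A `.locations` entry carries everything needed to pull a single record out of the crawl. The following is only an illustration of that idea, not the repository's download script; the filename path, offset, and length are placeholders:

```python
# Illustrative only: fetch one gzip-compressed WARC record described by a .locations entry.
import gzip
import io

import requests

loc = {"filename": "https://commoncrawl.s3.amazonaws.com/example/path.warc.gz",  # placeholder path
       "offset": 1234, "length": 5678}                                           # placeholder numbers

start = int(loc["offset"])
end = start + int(loc["length"]) - 1
r = requests.get(loc["filename"], headers={"Range": "bytes=%d-%d" % (start, end)})

# Each record in a CommonCrawl .warc.gz is its own gzip member,
# so the requested byte slice decompresses on its own.
record = gzip.GzipFile(fileobj=io.BytesIO(r.content)).read()
print(record[:200])
```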
baseline/locate_candidates_cc_index_api.py

Lines changed: 105 additions & 0 deletions

@@ -0,0 +1,105 @@
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import json
import tldextract
import re
import requests
import urllib

COMMONCRAWL_S3_URL = "https://commoncrawl.s3.amazonaws.com"
COMMONCRAWL_INDEX_URL = "http://index.commoncrawl.org"

INVALID_URL = "123"
INVALID_CRAWL = "abc"


def make_full_filename(filepath):
    return '/'.join([COMMONCRAWL_S3_URL, filepath])


def make_query_url(crawl, url):
    params = {
        "base_url": COMMONCRAWL_INDEX_URL,
        "crawl_id": crawl.replace('_', '-'),
        "url": urllib.quote(url, safe='')  # Percent-encode URL.
    }

    query = "{base_url}/CC-MAIN-{crawl_id}-index?url={url}&output=json&limit=1"
    return query.format(**params)


def get_location(session, url, crawl):
    """ Returns success and location """
    query_url = make_query_url(crawl, url)
    try:
        r = session.get(query_url)
        result = r.json()
    except (requests.RequestException, ValueError):
        # Request failed or the response was not valid JSON.
        return False, None

    try:
        data = {
            "filename": make_full_filename(result["filename"]),
            "length": result["length"],
            "mime": result["mime"],
            "offset": result["offset"]
        }
    except KeyError:
        return False, None

    return True, data


def report_error(url, crawl, errors, total):
    percentage = 100. * errors / total
    sys.stderr.write("Errors: %d/%d = %.2f%%\t%s\t%s\n" %
                     (errors, total, percentage, crawl, url))


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('candidates', type=argparse.FileType('r'),
                        help='file containing candidates')
    parser.add_argument('outfile', type=argparse.FileType('w'),
                        default=sys.stdout)
    parser.add_argument('-kv', help='input is a .kv.gz file',
                        default=False, action="store_true")
    args = parser.parse_args(sys.argv[1:])

    total_lines, total_errors = 0, 0
    with requests.Session() as session:
        for line in args.candidates:
            total_lines += 1
            line = line.decode("utf-8")
            if args.kv:
                # Lines have the format:
                # {domain} {url} {crawl}\t{language_data}
                url_data, _ = line.strip().split('\t')
                _, src_url, src_crawl = url_data.strip().split()
                tgt_success = False
            else:
                # Lines have the format:
                # {stripped_url} {src_url} {src_crawl} {tgt_url} {tgt_crawl}
                _, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()

            src_success, src_loc = get_location(session, src_url, src_crawl)
            if not src_success:
                total_errors += 1
                report_error(src_url, src_crawl, total_errors, total_lines)

            if not args.kv:
                tgt_success, tgt_loc = get_location(session, tgt_url, tgt_crawl)
                if not tgt_success:
                    total_errors += 1
                    report_error(tgt_url, tgt_crawl, total_errors, total_lines)

            if src_success and tgt_success:
                args.outfile.write("%s\t%s\t%s\n" %
                                   (src_url, src_crawl, json.dumps(src_loc)))
                args.outfile.write("%s\t%s\t%s\n" %
                                   (tgt_url, tgt_crawl, json.dumps(tgt_loc)))
            elif args.kv and src_success:
                args.outfile.write("%s\t%s\t%s\n" %
                                   (src_url, src_crawl, json.dumps(src_loc)))

    # Final summary of totals, reported against the last processed source URL.
    sys.stderr.write("Done: ")
    report_error(src_url, src_crawl, total_errors, total_lines)
```
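For reference, these are the two candidate-line formats the script splits, shown with illustrative placeholder URLs, crawl ids, and language statistics (not real data):

```python
# Default mode: {stripped_url} {src_url} {src_crawl} {tgt_url} {tgt_crawl}
line = "example.com/page http://example.com/en/page 2015_32 http://example.com/de/page 2015_32"
_, src_url, src_crawl, tgt_url, tgt_crawl = line.strip().split()

# -kv mode: {domain} {url} {crawl}\t{language_data}
kv_line = "example.com http://example.com/en/page 2015_32\ten:0.9"
url_data, _ = kv_line.strip().split('\t')
_, src_url, src_crawl = url_data.strip().split()
```

Each successfully located URL is written out as a tab-separated line of URL, crawl id, and the JSON location record, which is the input expected in Step 4.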
