Skip to content

Commit a78bcae

Browse files
committed
first commit of working spiderpig
0 parents  commit a78bcae

File tree

4 files changed

+223
-0
lines changed

4 files changed

+223
-0
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
node_modules
2+

README.md

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
Spiderpig
2+
=========
3+
4+
Spiderpig is a website crawler designed to archive your own sites as flat HTML files. It works great for WordPress or other dynamic sites, collapsing the site into a folder of HTML files.
5+
6+
You will need to copy over any assets like JS, CSS and images for now, since it only looks for pages linked to in `<a>` tags.
7+
8+
9+
## Usage
10+
11+
```
12+
./spider.js example.com
13+
```
14+
15+
This will crawl example.com, look for any links in `<a>` tags, and download all pages it finds to a folder called "example.com".
16+
17+
This does not download CSS files, JS or images, unless they are linked to in `<a>` tags.
18+
19+
20+
## License
21+
22+
Copyright 2014 Esri, Inc
23+
24+
Licensed under the Apache License, Version 2.0 (the "License");
25+
you may not use this file except in compliance with the License.
26+
You may obtain a copy of the License at
27+
28+
> http://www.apache.org/licenses/LICENSE-2.0
29+
30+
Unless required by applicable law or agreed to in writing, software
31+
distributed under the License is distributed on an "AS IS" BASIS,
32+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
33+
See the License for the specific language governing permissions and
34+
limitations under the License.
35+
36+
A copy of the license is available in the repository's [LICENSE.txt] file.
37+

package.json

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
{
2+
"name": "Spiderpig",
3+
"version": "1.0.0",
4+
"description": "",
5+
"main": "spider.js",
6+
"scripts": {
7+
"test": "echo \"Error: no test specified\" && exit 1"
8+
},
9+
"author": "Aaron Parecki http://aaronparecki.com",
10+
"license": "Apache-2.0",
11+
"dependencies": {
12+
"cheerio": "^0.18.0",
13+
"fs-tools": "^0.2.11",
14+
"request": "^2.53.0"
15+
}
16+
}

spider.js

+168
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
#!/usr/bin/env node
2+
var cheerio = require('cheerio');
3+
var request = require('request');
4+
var fstools = require('fs-tools');
5+
var fs = require('fs');
6+
var url = require('url');
7+
var http = require('http');
8+
9+
// Hostname of the site to crawl, taken from the first command-line argument.
var host = process.argv[2];

// Without a host there is nothing to crawl (and the URLs below would be
// built from the string "undefined"), so stop with a usage hint instead.
if(!host) {
  console.error("Usage: ./spider.js <hostname>");
  process.exit(1);
}

// Milliseconds to wait for an HTTP response; also used by loop() to pace
// its final "is the queue really empty" checks.
var http_timeout = 15000;

// Entry point of the crawl and the folder the pages are written to.
var base = "https://"+host+"/";
var output_dir = "./"+host;

// Crawl state shared by loop() and process_link().
var visited = {};        // URL -> true once a request for it has been started
var queue = [];          // URLs still waiting to be fetched
var ready = true;        // false while a page is being fetched
var running = 0;         // number of pages currently being processed
var num_processed = 0;   // number of URLs taken off the queue so far

// Number of consecutive times loop() found the queue empty.
var finishing = 0;

queue.push(base);
loop();
30+
31+
32+
/**
 * Scheduler for the crawl: takes one URL at a time off the queue and hands
 * it to process_link(), re-arming itself with setTimeout.
 *
 * While a page is being fetched (ready is false) it polls every 100 ms.
 * When the queue is empty it re-checks at intervals of http_timeout / 3
 * and stops rescheduling on the third empty check.
 */
function loop() {
  // A page is still being fetched; check again shortly.
  if(!ready) {
    setTimeout(loop, 100);
    return;
  }

  // Nothing queued: count the empty check and give up on the third one.
  if(queue.length === 0) {
    finishing++;
    if(finishing >= 3) {
      console.log("Nothing left in the queue");
      return;
    }
    setTimeout(loop, http_timeout / 3);
    return;
  }

  console.log("Queue length: "+queue.length);
  console.log("Total Processed: "+num_processed);

  // Claim the single processing slot before starting the request.
  ready = false;
  running++;
  num_processed++;

  var next_link = queue.shift();
  process_link(next_link);
  setTimeout(loop, 100);
}
57+
58+
59+
60+
/**
 * Record a redirect by appending the line "<from> <to>" to the .htaccess
 * file in the output directory.
 *
 * @param {string} from - URL that was requested
 * @param {string} to - URL the request was redirected to
 */
function store_redirect(from, to) {
  // A redirect can be recorded before any page has been written, so make
  // sure the output directory exists before touching the file.
  fstools.mkdirSync(output_dir);

  // appendFileSync creates the file when needed and always releases the
  // file descriptor, even if the write fails.
  fs.appendFileSync(output_dir+"/.htaccess", from+" "+to+"\n");
}
65+
66+
67+
/**
 * Work out where a crawled URL is stored below output_dir.
 *
 * Paths that do not look like a file (no trailing slash and no dot in the
 * last segment) are treated as directories, and directory paths are stored
 * as "index.html" inside that directory.
 *
 * @param {string} page - absolute URL of the page
 * @returns {{dirname: string, filename: string}} target directory (with a
 *   trailing slash) and file name
 */
function local_path_for(page) {
  var path = url.parse(page).path;

  // Add a slash if the path is not a file (does not end in a slash and does not have a dot)
  var components = path.split("/");
  if(!path.match(/\/$/) && !components[components.length-1].match(/\./)) {
    path += "/";
  }

  // Add "index.html" if the path ends in a slash
  if(path.match(/\/$/)) {
    path += "index.html";
  }

  // The path now always ends in a filename; everything before it is the directory
  components = path.split("/");
  var filename = components.pop();
  return {
    dirname: output_dir + "/" + components.join("/") + "/",
    filename: filename
  };
}

/**
 * Queue every same-site page linked from an HTML document.
 *
 * @param {string} html - markup of the downloaded page
 * @param {string} page - absolute URL the markup was downloaded from; relative
 *   links are resolved against it
 */
function queue_links(html, page) {
  var $ = cheerio.load(html);
  var links = $("a");

  for(var i = 0; i < links.length; i++) {
    var next_url = $(links[i]).attr("href");
    if(!next_url) {
      continue;
    }

    // slashesDenoteHost makes "//cdn.example.com/x" parse with a host, so
    // protocol-relative links to other sites are not mistaken for local paths
    var parsed = url.parse(next_url, false, true);

    // mailto:, javascript:, tel: and similar links are not pages
    if(parsed.protocol && parsed.protocol !== "http:" && parsed.protocol !== "https:") {
      continue;
    }

    if(parsed.host == null || parsed.host == host) {
      // Ignore the query string since we can't do anything with it anyway.
      // Relative links are relative to the page they appear on, not the site root.
      var resolved = url.resolve(page, (parsed.pathname ? parsed.pathname : ""));
      if(!visited[resolved] && queue.indexOf(resolved) == -1) {
        console.log("queuing: "+resolved);
        queue.push(resolved);
      }
    }
  }
}

/**
 * Download one URL, store it below output_dir and queue the same-site
 * links it contains.
 *
 * Redirects that stay on the crawled host are followed and recorded with
 * store_redirect(). When the work is finished (or skipped, or failed) the
 * function sets `ready` back to true and decrements `running` so loop()
 * can dispatch the next URL.
 *
 * @param {string} current - absolute URL to fetch
 */
function process_link(current) {

  if(visited[current] === true) {
    ready = true;
    running--;
    return;
  }

  console.log("===============================");
  console.log("Processing: " + current);

  visited[current] = true;

  request({
    url: current,
    timeout: http_timeout,
    // Deliver the body as a Buffer so images and other binary files linked
    // from <a> tags are written to disk unmodified
    encoding: null,
    followRedirect: function(response) {
      var redirect = url.parse(response.headers.location, false, true);
      // A Location without a host is relative and therefore stays on this site
      if(redirect.host == null || redirect.host == host) {
        return true;
      }
      // NOTE(review): the request callback presumably still runs for the
      // unfollowed 3xx response, so its body is stored like a normal page
      // and `ready` is set a second time there — confirm.
      ready = true;
      return false;
    }
  }, function(error,response,body) {

    if(error) {
      console.log(error);
      ready = true;
      running--;
      return;
    }

    // Find out if we followed any redirects to get here
    var redirects = response.request.redirects;
    if(redirects.length > 0) {
      // Write each hop to the redirect file; the last hop is the page we actually got
      var redirect_from = current;
      for(var i = 0; i < redirects.length; i++) {
        store_redirect(redirect_from, redirects[i].redirectUri);
        redirect_from = redirects[i].redirectUri;
      }
      current = redirect_from;
      console.log("Was redirected to: "+current);

      // The final URL has been downloaded too; do not fetch it again later
      visited[current] = true;
    }

    var target = local_path_for(current);
    console.log("Filename: "+target.filename);
    console.log("Directory: "+target.dirname);

    fstools.mkdirSync(target.dirname);
    fs.writeFileSync(target.dirname+target.filename, body);

    queue_links(body.toString("utf8"), current);

    ready = true;
    running--;
  });

}
168+

0 commit comments

Comments
 (0)