first commit of working spiderpig

aaronpk · aaronpk · commit a78bcae88a17 · 2015-02-26T16:07:23.000-08:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+node_modules
+
diff --git a/README.md b/README.md
@@ -0,0 +1,37 @@
+Spiderpig
+=========
+
+Spiderpig is a website crawler designed to archive your own sites as flat HTML files. It works great for WordPress or other dynamic sites, collapsing the site into a folder of HTML files.
+
+You will need to copy over any assets like JS, CSS and images for now, since it only looks for pages linked to in `<a>` tags.
+
+
+## Usage
+
+```
+node spider.js example.com
+```
+
+This will crawl example.com, look for any links in `<a>` tags, and download all pages it finds to a folder called "example.com".
+
+This does not download CSS files, JS or images, unless they are linked to in `<a>` tags.
+
+
+## License
+
+Copyright 2014 Esri, Inc
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+> http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+A copy of the license is available in the repository's [LICENSE.txt] file.
+
diff --git a/package.json b/package.json
@@ -0,0 +1,16 @@
+{
+  "name": "Spiderpig",
+  "version": "1.0.0",
+  "description": "",
+  "main": "spider.js",
+  "scripts": {
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "author": "Aaron Parecki http://aaronparecki.com",
+  "license": "Apache-2.0",
+  "dependencies": {
+    "cheerio": "^0.18.0",
+    "fs-tools": "^0.2.11",
+    "request": "^2.53.0"
+  }
+}
diff --git a/spider.js b/spider.js
@@ -0,0 +1,168 @@
+#!/usr/bin/env node
+var cheerio = require('cheerio');
+var request = require('request');
+var fstools = require('fs-tools');
+var fs = require('fs');
+var url = require('url');
+var http = require('http');
+
+var host = process.argv[2];
+
+// TODO: make sure host is set
+
+var http_timeout = 15000;
+
+var base = "https://"+host+"/";
+var output_dir = "./"+host;
+
+base = "https://geoloqi.com/blog/2012/05/unleashing-the-next-generation-of-location-with-geoloqis-titanium-module/geoloqi-visual-trigger-editor-3/";
+
+var visited = {};
+var queue = [];
+var ready = true;
+var running = 0;
+var num_processed = 0;
+
+var finishing = 0;
+
+queue.push(base);
+loop();
+
+
+function loop() {
+  if(ready) {
+    if(queue.length > 0) {
+      console.log("Queue length: "+queue.length);
+      console.log("Total Processed: "+num_processed);
+      ready = false;
+      running++;
+      num_processed++;
+
+      // console.log("Currently Running: "+running);
+
+      process_link(queue.shift());
+      setTimeout(loop, 100);
+    } else {
+      finishing++;
+      if(finishing < 3) {
+        setTimeout(loop, http_timeout / 3);
+      } else {
+        console.log("Nothing left in the queue");
+      }
+    }
+  } else {
+    setTimeout(loop, 100);
+  }
+}
+
+
+
+function store_redirect(from, to) {
+  var redirects = fs.openSync(output_dir+"/.htaccess", "a");
+  fs.writeSync(redirects, from+" "+to+"\n");
+  fs.closeSync(redirects);
+}
+
+
+function process_link(current) {
+
+  if(visited[current] == true) {
+    // console.log("Already visited!");
+    ready = true;
+    running--;
+    return;
+  }
+
+  console.log("===============================");
+  console.log("Processing: " + current);
+
+  visited[current] = true;
+
+  request({
+    url: current,
+    timeout: http_timeout,
+    // pool: { maxSockets: 1 },
+    followRedirect: function(response) {
+      var redirect = url.parse(response.headers.location);
+      if(redirect.host == host) {
+        return true;
+      } else {
+        ready = true;
+        //running--;
+        return false;
+      }
+    }
+  }, function(error,response,body) {
+
+    if(error) {
+      console.log(error);
+      ready = true;
+      running--;
+    } else {
+
+      // Find out if we followed any redirects to get here
+      var redirect_from = current;
+      if(response.request.redirects.length > 0) {
+        // Write each redirect to the file
+        for(var i=0; i<response.request.redirects.length; i++) {
+          var r = response.request.redirects[i];
+          store_redirect(redirect_from, r.redirectUri);
+          redirect_from = r.redirectUri;
+          // Update the "current" URL to set it to the resulting URL
+          current = r.redirectUri;
+        }
+        console.log("Was redirected to: "+current);
+      }
+
+      var page_url = url.parse(current);
+
+      // Add a slash if the path is not a file (does not end in a slash and does not have a dot)
+      var components = page_url.path.split("/");
+      if(!page_url.path.match(/\/$/) && !components[components.length-1].match(/\./)) {
+        page_url.path += "/";
+      }
+
+      // Add "index.html" if the path ends in a slash
+      if(page_url.path.match(/\/$/)) {
+        page_url.path += "index.html";
+      }
+
+      // The path will now always end in a filename
+      // Split the path on / and remove the filename to create the directory
+      components = page_url.path.split("/");
+      var filename = components.pop();
+      var dirname = output_dir + "/" + components.join("/") + "/";
+      console.log("Filename: "+filename);
+      console.log("Directory: "+dirname);
+
+      fstools.mkdirSync(dirname);
+      fs.writeFileSync(dirname+filename, body, "utf8");
+
+      var $ = cheerio.load(body);
+      var links = $("a");
+
+      for(var i = 0; i < links.length; i++) {
+        var a = links[i];
+        var next_url = $(a).attr("href");
+        if(next_url) {
+          var parsed = url.parse(next_url);
+
+          if(parsed.host == null || parsed.host == host) {
+            // Ignore the query string since we can't do anything with it anyway
+            var resolved = url.resolve(base, (parsed.pathname ? parsed.pathname : "")); //+(parsed.search ? parsed.search : ""));
+            if(!visited[resolved] && queue.indexOf(resolved) == -1) {
+              console.log("queuing: "+resolved);
+              queue.push(resolved);
+            }
+          } else {
+            // console.log("skipping: "+next_url);
+          }
+        }
+      }
+      ready = true;
+      running--;
+    }
+  });
+
+}
+