File save as worker, bugfixes
karust committed May 15, 2023
1 parent 8b4112a commit 5cab328
Showing 7 changed files with 112 additions and 68 deletions.
37 changes: 23 additions & 14 deletions README.md
@@ -1,10 +1,13 @@
# Go Get Crawl
**goGetCrawl** is a tool and library which helps you download URLs and files from popular web archives like [Common Crawl](http://commoncrawl.org) and the [Wayback Machine](https://web.archive.org/). You can use it as a command line tool or import the solution into your Go project.
[![Go Report Card](https://goreportcard.com/badge/github.com/karust/goGetCrawl)](https://goreportcard.com/report/github.com/karust/goGetCrawl)
[![Go Reference](https://pkg.go.dev/badge/github.com/karust/goGetCrawl.svg)](https://pkg.go.dev/github.com/karust/goGetCrawl)

**GoGetCrawl** is a tool and package which helps you download URLs and files from popular web archives like [Common Crawl](http://commoncrawl.org) and the [Wayback Machine](https://web.archive.org/). You can use it as a command-line tool or import the package into your Go project.

## Installation
### Source
```
go install github.com/karust/goGetCrawl
go install github.com/karust/goGetCrawl@latest
```

### Docker
@@ -24,28 +27,31 @@ Check out the latest release if you need binary [here](https://github.com/karust
gogetcrawl -h
```

* Get URLs:
#### Get URLs

You can fetch archive data for multiple domains; the flags will be applied to each. By default, all results are displayed in your terminal.
* You can fetch archive data for multiple domains; the flags will be applied to each. By default, all results are displayed in your terminal:
```
gogetcrawl url *.example.com kamaloff.ru
```

To limit the number of results, write the output to a file, and use only Wayback as the source:
* To limit the number of results, write the output to a file, and use only Wayback as the source:
```
gogetcrawl url *.example.com kamaloff.ru --limit 10 --sources wb -o ./urls.txt
```

* Download files:
To download 10 `PDF` files to the `./test` directory with 3 workers:
#### Download files
* To download 10 `PDF` files to the `./test` directory with 3 workers:
```
gogetcrawl download *.cia.gov/* --limit 10 -w 3 -d ./test -f "mimetype:application/pdf"
```

### Library usage
For both Wayback and Common Crawl you can use `concurrent` and `non-concurrent` ways to interact with the archives:
### Package usage
```
go get github.com/karust/goGetCrawl
```
*For both Wayback and Common Crawl you can use `concurrent` and `non-concurrent` ways to interact with the archives.*
#### Wayback
* Get urls
* **Get URLs:**
```go
package main

@@ -76,7 +82,7 @@ func main() {
}
```
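The middle of this example is collapsed in the diff above. A minimal, self-contained sketch of the non-concurrent flow could look like the following; the `wayback` import path, the `wayback.New(timeout, retries)` signature (mirroring `commoncrawl.New`), and the `Filters`/`Limit` field names are assumptions inferred from the surrounding code, not confirmed by this commit:
```go
package main

import (
	"fmt"
	"log"

	"github.com/karust/goGetCrawl/common"
	"github.com/karust/goGetCrawl/wayback" // assumed import path
)

func main() {
	// Assumed constructor signature: timeout in seconds, max retries.
	wb, err := wayback.New(15, 2)
	if err != nil {
		log.Fatalln(err)
	}

	// Field names other than URL are assumptions based on the CLI filters above.
	config := common.RequestConfig{
		URL:     "*.example.com/*",
		Filters: []string{"statuscode:200", "mimetype:text/html"},
		Limit:   10,
	}

	// Non-concurrent call: fetch all matching CDX records at once.
	results, err := wb.GetPages(config)
	if err != nil {
		log.Fatalln(err)
	}

	for _, r := range results {
		fmt.Println(r.Original, r.Timestamp, r.MimeType)
	}
}
```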

* Get files
* **Get files:**
```go
// Get all status:200 HTML files
config := common.RequestConfig{
@@ -94,9 +100,9 @@ fmt.Println(string(file))
```
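Since this commit also exposes `common.SaveFile(data []byte, path string) error` (see `common/common.go` further down), a hedged continuation of the snippet above — reusing its `file` variable, imports omitted as in the snippets above — might persist the fetched bytes like this; the output path is only illustrative:
```go
// Continuation sketch: write the `file` bytes from the example above to disk.
// Assumes the import "github.com/karust/goGetCrawl/common"; the path is illustrative.
if err := common.SaveFile(file, "./test/page.html"); err != nil {
	log.Fatalln(err)
}
```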

#### Common Crawl
To use Common Crawl you just need to replace the `wayback` module with `commoncrawl`. Let's use Common Crawl concurrently:
*To use Common Crawl you just need to replace the `wayback` module with `commoncrawl`. Let's use Common Crawl concurrently.*

* Get urls
* **Get URLs:**
```go
cc, _ := commoncrawl.New(30, 3)

@@ -135,7 +141,7 @@ for {
}
```
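The concurrent part of this example is collapsed in the diff. A sketch of the pattern, with the channel types inferred from `FetchPages` and the new `common.SaveFiles` signature in this commit (the actual README example may differ; imports omitted as above), could be:
```go
cc, _ := commoncrawl.New(30, 3)

config := common.RequestConfig{
	URL: "*.example.com/*",
}

// FetchPages blocks and streams result batches into the channels,
// so it runs in its own goroutine here.
results := make(chan []*common.CdxResponse)
errors := make(chan error)
go cc.FetchPages(config, results, errors)

// Drain both channels; a real program would add a termination condition.
for {
	select {
	case batch := <-results:
		for _, r := range batch {
			fmt.Println(r.Original, r.Timestamp, r.MimeType)
		}
	case err := <-errors:
		fmt.Println("error:", err)
	}
}
```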

* Get files
* **Get files:**
```go
config := common.RequestConfig{
URL: "kamaloff.ru/*",
@@ -146,3 +152,6 @@ cc, _ := commoncrawl.New(15, 2)
results, _ := cc.GetPages(config)
file, err := cc.GetFile(results[0])
```
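This commit's headline change moves file saving into a dedicated worker, `common.SaveFiles` (see `common/common.go` below). A hedged sketch of wiring it to the concurrent fetcher — channel types and the rate argument taken from the new `SaveFiles(results <-chan []*CdxResponse, outputDir string, errors chan error, downloadRate float32)` signature, imports omitted as above — could look like:
```go
cc, _ := commoncrawl.New(30, 3)

config := common.RequestConfig{
	URL: "*.example.com/*",
}

results := make(chan []*common.CdxResponse)
errors := make(chan error)

// Producer: stream CDX result batches from Common Crawl.
go cc.FetchPages(config, results, errors)

// Consumer: the new save-as-worker, writing files into ./test and
// pausing 1 second between downloads (the unit used by the --rate flag).
go common.SaveFiles(results, "./test", errors, 1.0)

// Log errors as they arrive; the channels are never closed in this sketch.
for err := range errors {
	fmt.Println("error:", err)
}
```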

## Bugs + Features
If you have issues, bugs, or feature requests, feel free to open an issue.
56 changes: 17 additions & 39 deletions cmd/file.go
@@ -1,10 +1,7 @@
package cmd

import (
"fmt"
"log"
"mime"
"net/url"
"os"
"path/filepath"
"sync"
@@ -17,7 +14,7 @@ import (
type fileScenario struct {
finishedWorkers uint
outputDir string
downloadRate uint
downloadRate float32
}

var fileScn = fileScenario{}
@@ -26,29 +23,29 @@ var fileCMD = &cobra.Command{
Use: "file",
Aliases: []string{"download"},
Short: "Download files located in web arhives for desired domains",
Args: cobra.MatchAll(cobra.MinimumNArgs(1), cobra.OnlyValidArgs),
Run: fileScn.spawnWorkers,
}

func (fs *fileScenario) worker(configs chan common.RequestConfig) {
func (fs *fileScenario) worker(configs <-chan common.RequestConfig) {
for {
select {
case config, ok := <-configs:
if ok {
var wg sync.WaitGroup
for _, s := range sources {

wg.Add(1)
go func(s common.Source) {
defer wg.Done()
pages, err := s.GetPages(config)
if err != nil {
errors <- err
return
}
for _, p := range pages {
fs.saveFile(s, p)
time.Sleep(time.Second * time.Duration(fs.downloadRate))
}
s.FetchPages(config, results, errors)
}(s)

//wg.Add(1)
go func() {
//defer wg.Done()
common.SaveFiles(results, fs.outputDir, errors, fs.downloadRate)
}()
}
wg.Wait()
} else {
@@ -60,13 +57,16 @@ func (fs *fileScenario) worker(configs chan common.RequestConfig) {
}

func (fs *fileScenario) spawnWorkers(cmd *cobra.Command, args []string) {
err := os.MkdirAll(fs.outputDir, os.ModePerm)
fp, _ := filepath.Abs(fs.outputDir)
err := os.MkdirAll(fp, os.ModePerm)
if err != nil {
log.Fatalf("Cannot get access to '%v' dir: %v", fileScn.outputDir, err)
} else {
log.Printf("Setting '%v' as output directorty", fp)
}

initSources()
configs := getRequestConfigs(args)
initSources()

var wg sync.WaitGroup

@@ -101,31 +101,9 @@ func (fs *fileScenario) spawnWorkers(cmd *cobra.Command, args []string) {
close(results)
}

func (fs *fileScenario) saveFile(source common.Source, page *common.CdxResponse) {
file, err := source.GetFile(page)
if err != nil {
errors <- err
return
}

exts, _ := mime.ExtensionsByType(page.MimeType)
if exts == nil {
exts = []string{""}
}

filename := fmt.Sprintf("%v-%v-%v%v", page.Original, page.Timestamp, source.Name(), exts[0])
escapedFilename := url.QueryEscape(filename)
fullPath := filepath.Join(fs.outputDir, escapedFilename)

err = common.SaveFile(file, fullPath)
if err != nil {
errors <- err
}
}

func init() {
fileCMD.Flags().StringVarP(&fileScn.outputDir, "dir", "d", "", "Path to the output directory")
fileCMD.Flags().UintVarP(&fileScn.downloadRate, "rate", "", 5, "Download rate in seconds for each worker (thread)")
fileCMD.Flags().Float32VarP(&fileScn.downloadRate, "rate", "", 1.0, "Download rate in seconds for each worker (thread). Ex: 5, 1.5")
rootCmd.AddCommand(fileCMD)
fileCMD.MarkFlagRequired("dir")
}
19 changes: 16 additions & 3 deletions cmd/main.go
@@ -4,6 +4,7 @@ import (
"fmt"
"io"
"log"
"mime"
"os"

"github.com/karust/goGetCrawl/common"
@@ -12,7 +13,7 @@ import (
"github.com/spf13/cobra"
)

const version = "0.2"
const version = "1.1.0"

var (
filters []string
@@ -61,6 +62,10 @@ func initSources() {
sources = append(sources, wb)
}
}

if len(sources) == 0 {
log.Fatalf("No archive sources provided.")
}
}

// Prepare archive request configs
@@ -71,6 +76,14 @@ func getRequestConfigs(args []string) chan common.RequestConfig {
filters = append(filters, []string{"statuscode:200", "mimetype:text/html"}...)
}

for _, ext := range extensions {
mtype := mime.TypeByExtension("." + ext)
if mtype == "" {
log.Fatalln(fmt.Sprintf("No MIME type found for '%v', please use '--filter' with the corresponding MIME type.", ext))
}
filters = append(filters, "mimetype:"+mtype)
}

for _, domain := range args {
config := common.RequestConfig{
URL: domain,
@@ -116,8 +129,8 @@ func init() {
rootCmd.PersistentFlags().IntVarP(&maxTimeout, "timeout", "t", 30, `Max timeout of requests.`)
rootCmd.PersistentFlags().IntVarP(&maxRetries, "retries", "r", 3, `Max request retries.`)
rootCmd.PersistentFlags().UintVarP(&maxResults, "limit", "l", 0, `Max number of results to fetch.`)
rootCmd.PersistentFlags().UintVarP(&maxWorkers, "workers", "w", 4, `Max number of workers (threads) to use."`)
rootCmd.PersistentFlags().StringSliceVarP(&extensions, "ext", "e", []string{}, `Which extensions to collect. Example: --ext "pdf,doc,jpeg"`)
rootCmd.PersistentFlags().UintVarP(&maxWorkers, "workers", "w", 4, `Max number of workers (threads) to use. URL consumes 1 worker.`)
rootCmd.PersistentFlags().StringSliceVarP(&extensions, "ext", "e", []string{}, `Which extensions to collect. Example: --ext "pdf,xml,jpeg"`)
rootCmd.PersistentFlags().StringSliceVarP(&sourceNames, "sources", "s", []string{"wb", "cc"}, `Web archive sources to use. Example: --sources "wb" to use only the Wayback`)
rootCmd.PersistentFlags().BoolVarP(&isDefaultFilters, "default-filter", "", false, `Use default filters (statuscode:200, mimetype:text/html).`)
rootCmd.PersistentFlags().BoolVarP(&isVerbose, "verbose", "v", false, `Use verbose output.`)
3 changes: 2 additions & 1 deletion cmd/url.go
@@ -22,6 +22,7 @@ var urlCMD = &cobra.Command{
Use: "url",
Aliases: []string{"collect"},
Short: "Collect URLs from web archives for desired domain",
Args: cobra.MatchAll(cobra.MinimumNArgs(1), cobra.OnlyValidArgs),
Run: urlScn.spawnWorkers,
}

@@ -54,8 +55,8 @@ func (us *urlScenario) spawnWorkers(cmd *cobra.Command, args []string) {
log.Fatalf("Error obtaining output: %v", err)
}

initSources()
configs := getRequestConfigs(args)
initSources()

var wg sync.WaitGroup

41 changes: 41 additions & 0 deletions common/common.go
@@ -6,7 +6,9 @@ import (
"log"
"mime"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"

@@ -31,6 +33,7 @@ type CdxResponse struct {
Length string `json:"length,omitempty"`
StatusCode string `json:"status,omitempty"`
Filename string `json:"filename,omitempty"`
Source Source
}

// Source of web archive data
@@ -141,6 +144,7 @@ func Get(url string, timeout int, maxRetries int) ([]byte, error) {
return nil, fmt.Errorf("Perfomed max retries, no result: %v", err)
}

// Save data using file fullpath
func SaveFile(data []byte, path string) error {
err := os.WriteFile(path, data, 0644)
if err != nil {
@@ -150,6 +154,43 @@ func SaveFile(data []byte, path string) error {
return nil
}

// Save files from CDX Response channel into output directory
func SaveFiles(results <-chan []*CdxResponse, outputDir string, errors chan error, downloadRate float32) {
log.Println("[SaveFiles] worker started:", outputDir)

for {
select {
case resBatch, ok := <-results:
if ok {
for _, res := range resBatch {
data, err := res.Source.GetFile(res)
if err != nil {
errors <- err
continue
}

exts, _ := mime.ExtensionsByType(res.MimeType)
if exts == nil {
exts = []string{""}
}

filename := fmt.Sprintf("%v-%v-%v%v", res.Original, res.Timestamp, res.Source.Name(), exts[0])
escapedFilename := url.QueryEscape(filename)
fullPath := filepath.Join(outputDir, escapedFilename)

err = SaveFile(data, fullPath)
if err != nil {
errors <- err
}

time.Sleep(time.Second * time.Duration(downloadRate))
}
}
}
}

}

func GetFileExtenstion(file *[]byte) (string, error) {
contentType := http.DetectContentType(*file)
contentType = strings.Split(contentType, ";")[0]
1 change: 1 addition & 0 deletions commoncrawl/commoncrawl.go
@@ -107,6 +107,7 @@ func (cc *CommonCrawl) ParseResponse(resp []byte) ([]*common.CdxResponse, error)
if err := jsoniter.Unmarshal(line, &indexVal); err != nil {
return nil, fmt.Errorf("[ParseResponse] Cannot decode JSON line: %v. Response: %v", err, string(line))
}
indexVal.Source = cc
pages = append(pages, &indexVal)
}
