File save as worker, bugfixes
karust committed May 15, 2023
1 parent 8b4112a commit 5cab328
Showing 7 changed files with 112 additions and 68 deletions.
37 changes: 23 additions & 14 deletions README.md
@@ -1,10 +1,13 @@
# Go Get Crawl
**goGetCrawl** is a tool and library which helps you download URLs and files from popular web archives like [Common Crawl](http://commoncrawl.org) and the [Wayback Machine](https://web.archive.org/). You can use it as a command line tool or import the solution into your Go project.
[![Go Report Card](https://goreportcard.com/badge/github.com/karust/goGetCrawl)](https://goreportcard.com/report/github.com/karust/goGetCrawl)
[![Go Reference](https://pkg.go.dev/badge/github.com/karust/goGetCrawl.svg)](https://pkg.go.dev/github.com/karust/goGetCrawl)

**GoGetCrawl** is a tool and package which helps you download URLs and files from popular web archives like [Common Crawl](http://commoncrawl.org) and the [Wayback Machine](https://web.archive.org/). You can use it as a command-line tool or import the package into your Go project.

## Installation
### Source
```
go install github.com/karust/goGetCrawl
go install github.com/karust/goGetCrawl@latest
```

### Docker
@@ -24,28 +27,31 @@ Check out the latest release if you need binary [here](https://github.com/karust
gogetcrawl -h
```

* Get URLs:
#### Get URLs

You can fetch archive data for multiple domains; the flags will be applied to each. By default, all results are displayed in your terminal.
* You can fetch archive data for multiple domains; the flags will be applied to each. By default, all results are displayed in your terminal:
```
gogetcrawl url *.example.com kamaloff.ru
```

To limit the number of results, write the output to a file, and use only Wayback as the source:
* To limit the number of results, write the output to a file, and use only Wayback as the source:
```
gogetcrawl url *.example.com kamaloff.ru --limit 10 --sources wb -o ./urls.txt
```

* Download files:
To download 10 `PDF` files to the `./test` directory with 3 workers:
#### Download files
* To download 10 `PDF` files to the `./test` directory with 3 workers:
```
gogetcrawl download *.cia.gov/* --limit 10 -w 3 -d ./test -f "mimetype:application/pdf"
```

### Library usage
For both Wayback and Common Crawl you can use `concurrent` and `non-concurrent` ways to interact with the archives:
### Package usage
```
go get github.com/karust/goGetCrawl
```
*For both Wayback and Common Crawl you can use `concurrent` and `non-concurrent` ways to interact with the archives.*
#### Wayback
* Get urls
* **Get URLs:**
```go
package main

@@ -76,7 +82,7 @@ func main() {
}
```
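The middle of this example is collapsed in the diff above. A minimal, self-contained sketch of the non-concurrent flow could look like the following; the `wayback` import path, the `wayback.New(timeout, retries)` signature (mirroring `commoncrawl.New`), and the `Filters`/`Limit` field names are assumptions inferred from the surrounding code, not confirmed by this commit:
```go
package main

import (
	"fmt"
	"log"

	"github.com/karust/goGetCrawl/common"
	"github.com/karust/goGetCrawl/wayback" // assumed import path
)

func main() {
	// Assumed constructor signature: timeout in seconds, max retries.
	wb, err := wayback.New(15, 2)
	if err != nil {
		log.Fatalln(err)
	}

	// Field names other than URL are assumptions based on the CLI filters above.
	config := common.RequestConfig{
		URL:     "*.example.com/*",
		Filters: []string{"statuscode:200", "mimetype:text/html"},
		Limit:   10,
	}

	// Non-concurrent call: fetch all matching CDX records at once.
	results, err := wb.GetPages(config)
	if err != nil {
		log.Fatalln(err)
	}

	for _, r := range results {
		fmt.Println(r.Original, r.Timestamp, r.MimeType)
	}
}
```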

* Get files
* **Get files:**
```go
// Get all status:200 HTML files
config := common.RequestConfig{
@@ -94,9 +100,9 @@ fmt.Println(string(file))
```
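Since this commit also exposes `common.SaveFile(data []byte, path string) error` (see `common/common.go` further down), a hedged continuation of the snippet above — reusing its `file` variable, imports omitted as in the snippets above — might persist the fetched bytes like this; the output path is only illustrative:
```go
// Continuation sketch: write the `file` bytes from the example above to disk.
// Assumes the import "github.com/karust/goGetCrawl/common"; the path is illustrative.
if err := common.SaveFile(file, "./test/page.html"); err != nil {
	log.Fatalln(err)
}
```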

#### Common Crawl
To use Common Crawl you just need to replace the `wayback` module with `commoncrawl`. Let's use Common Crawl concurrently:
*To use Common Crawl you just need to replace the `wayback` module with `commoncrawl`. Let's use Common Crawl concurrently.*

* Get urls
* **Get URLs:**
```go
cc, _ := commoncrawl.New(30, 3)

@@ -135,7 +141,7 @@ for {
}
```
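The concurrent part of this example is collapsed in the diff. A sketch of the pattern, with the channel types inferred from `FetchPages` and the new `common.SaveFiles` signature in this commit (the actual README example may differ; imports omitted as above), could be:
```go
cc, _ := commoncrawl.New(30, 3)

config := common.RequestConfig{
	URL: "*.example.com/*",
}

// FetchPages blocks and streams result batches into the channels,
// so it runs in its own goroutine here.
results := make(chan []*common.CdxResponse)
errors := make(chan error)
go cc.FetchPages(config, results, errors)

// Drain both channels; a real program would add a termination condition.
for {
	select {
	case batch := <-results:
		for _, r := range batch {
			fmt.Println(r.Original, r.Timestamp, r.MimeType)
		}
	case err := <-errors:
		fmt.Println("error:", err)
	}
}
```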

* Get files
* **Get files:**
```go
config := common.RequestConfig{
URL: "kamaloff.ru/*",
@@ -146,3 +152,6 @@ cc, _ := commoncrawl.New(15, 2)
results, _ := cc.GetPages(config)
file, err := cc.GetFile(results[0])
```
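This commit's headline change moves file saving into a dedicated worker, `common.SaveFiles` (see `common/common.go` below). A hedged sketch of wiring it to the concurrent fetcher — channel types and the rate argument taken from the new `SaveFiles(results <-chan []*CdxResponse, outputDir string, errors chan error, downloadRate float32)` signature, imports omitted as above — could look like:
```go
cc, _ := commoncrawl.New(30, 3)

config := common.RequestConfig{
	URL: "*.example.com/*",
}

results := make(chan []*common.CdxResponse)
errors := make(chan error)

// Producer: stream CDX result batches from Common Crawl.
go cc.FetchPages(config, results, errors)

// Consumer: the new save-as-worker, writing files into ./test and
// pausing 1 second between downloads (the unit used by the --rate flag).
go common.SaveFiles(results, "./test", errors, 1.0)

// Log errors as they arrive; the channels are never closed in this sketch.
for err := range errors {
	fmt.Println("error:", err)
}
```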

## Bugs + Features
If you have issues, bugs, or feature requests, feel free to open an issue.
56 changes: 17 additions & 39 deletions cmd/file.go
@@ -1,10 +1,7 @@
package cmd

import (
"fmt"
"log"
"mime"
"net/url"
"os"
"path/filepath"
"sync"
@@ -17,7 +14,7 @@ import (
type fileScenario struct {
finishedWorkers uint
outputDir string
downloadRate uint
downloadRate float32
}

var fileScn = fileScenario{}
@@ -26,29 +23,29 @@ var fileCMD = &cobra.Command{
Use: "file",
Aliases: []string{"download"},
Short: "Download files located in web arhives for desired domains",
Args: cobra.MatchAll(cobra.MinimumNArgs(1), cobra.OnlyValidArgs),
Run: fileScn.spawnWorkers,
}

func (fs *fileScenario) worker(configs chan common.RequestConfig) {
func (fs *fileScenario) worker(configs <-chan common.RequestConfig) {
for {
select {
case config, ok := <-configs:
if ok {
var wg sync.WaitGroup
for _, s := range sources {

wg.Add(1)
go func(s common.Source) {
defer wg.Done()
pages, err := s.GetPages(config)
if err != nil {
errors <- err
return
}
for _, p := range pages {
fs.saveFile(s, p)
time.Sleep(time.Second * time.Duration(fs.downloadRate))
}
s.FetchPages(config, results, errors)
}(s)

//wg.Add(1)
go func() {
//defer wg.Done()
common.SaveFiles(results, fs.outputDir, errors, fs.downloadRate)
}()
}
wg.Wait()
} else {
@@ -60,13 +57,16 @@ func (fs *fileScenario) worker(configs chan common.RequestConfig) {
}

func (fs *fileScenario) spawnWorkers(cmd *cobra.Command, args []string) {
err := os.MkdirAll(fs.outputDir, os.ModePerm)
fp, _ := filepath.Abs(fs.outputDir)
err := os.MkdirAll(fp, os.ModePerm)
if err != nil {
log.Fatalf("Cannot get access to '%v' dir: %v", fileScn.outputDir, err)
} else {
log.Printf("Setting '%v' as output directorty", fp)
}

initSources()
configs := getRequestConfigs(args)
initSources()

var wg sync.WaitGroup

@@ -101,31 +101,9 @@ func (fs *fileScenario) spawnWorkers(cmd *cobra.Command, args []string) {
close(results)
}

func (fs *fileScenario) saveFile(source common.Source, page *common.CdxResponse) {
file, err := source.GetFile(page)
if err != nil {
errors <- err
return
}

exts, _ := mime.ExtensionsByType(page.MimeType)
if exts == nil {
exts = []string{""}
}

filename := fmt.Sprintf("%v-%v-%v%v", page.Original, page.Timestamp, source.Name(), exts[0])
escapedFilename := url.QueryEscape(filename)
fullPath := filepath.Join(fs.outputDir, escapedFilename)

err = common.SaveFile(file, fullPath)
if err != nil {
errors <- err
}
}

func init() {
fileCMD.Flags().StringVarP(&fileScn.outputDir, "dir", "d", "", "Path to the output directory")
fileCMD.Flags().UintVarP(&fileScn.downloadRate, "rate", "", 5, "Download rate in seconds for each worker (thread)")
fileCMD.Flags().Float32VarP(&fileScn.downloadRate, "rate", "", 1.0, "Download rate in seconds for each worker (thread). Ex: 5, 1.5")
rootCmd.AddCommand(fileCMD)
fileCMD.MarkFlagRequired("dir")
}
19 changes: 16 additions & 3 deletions cmd/main.go
@@ -4,6 +4,7 @@ import (
"fmt"
"io"
"log"
"mime"
"os"

"github.com/karust/goGetCrawl/common"
@@ -12,7 +13,7 @@ import (
"github.com/spf13/cobra"
)

const version = "0.2"
const version = "1.1.0"

var (
filters []string
@@ -61,6 +62,10 @@ func initSources() {
sources = append(sources, wb)
}
}

if len(sources) == 0 {
log.Fatalf("No archive sources provided.")
}
}

// Prepare archive request configs
@@ -71,6 +76,14 @@ func getRequestConfigs(args []string) chan common.RequestConfig {
filters = append(filters, []string{"statuscode:200", "mimetype:text/html"}...)
}

for _, ext := range extensions {
mtype := mime.TypeByExtension("." + ext)
if mtype == "" {
log.Fatalln(fmt.Sprintf("No MIME type found for '%v', please use '--filter' with the corresponding MIME type.", ext))
}
filters = append(filters, "mimetype:"+mtype)
}

for _, domain := range args {
config := common.RequestConfig{
URL: domain,
@@ -116,8 +129,8 @@ func init() {
rootCmd.PersistentFlags().IntVarP(&maxTimeout, "timeout", "t", 30, `Max timeout of requests.`)
rootCmd.PersistentFlags().IntVarP(&maxRetries, "retries", "r", 3, `Max request retries.`)
rootCmd.PersistentFlags().UintVarP(&maxResults, "limit", "l", 0, `Max number of results to fetch.`)
rootCmd.PersistentFlags().UintVarP(&maxWorkers, "workers", "w", 4, `Max number of workers (threads) to use."`)
rootCmd.PersistentFlags().StringSliceVarP(&extensions, "ext", "e", []string{}, `Which extensions to collect. Example: --ext "pdf,doc,jpeg"`)
rootCmd.PersistentFlags().UintVarP(&maxWorkers, "workers", "w", 4, `Max number of workers (threads) to use. URL consumes 1 worker.`)
rootCmd.PersistentFlags().StringSliceVarP(&extensions, "ext", "e", []string{}, `Which extensions to collect. Example: --ext "pdf,xml,jpeg"`)
rootCmd.PersistentFlags().StringSliceVarP(&sourceNames, "sources", "s", []string{"wb", "cc"}, `Web archive sources to use. Example: --sources "wb" to use only the Wayback`)
rootCmd.PersistentFlags().BoolVarP(&isDefaultFilters, "default-filter", "", false, `Use default filters (statuscode:200, mimetype:text/html).`)
rootCmd.PersistentFlags().BoolVarP(&isVerbose, "verbose", "v", false, `Use verbose output.`)
3 changes: 2 additions & 1 deletion cmd/url.go
@@ -22,6 +22,7 @@ var urlCMD = &cobra.Command{
Use: "url",
Aliases: []string{"collect"},
Short: "Collect URLs from web archives for desired domain",
Args: cobra.MatchAll(cobra.MinimumNArgs(1), cobra.OnlyValidArgs),
Run: urlScn.spawnWorkers,
}

@@ -54,8 +55,8 @@ func (us *urlScenario) spawnWorkers(cmd *cobra.Command, args []string) {
log.Fatalf("Error obtaining output: %v", err)
}

initSources()
configs := getRequestConfigs(args)
initSources()

var wg sync.WaitGroup

41 changes: 41 additions & 0 deletions common/common.go
@@ -6,7 +6,9 @@ import (
"log"
"mime"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"
"time"

@@ -31,6 +33,7 @@ type CdxResponse struct {
Length string `json:"length,omitempty"`
StatusCode string `json:"status,omitempty"`
Filename string `json:"filename,omitempty"`
Source Source
}

// Source of web archive data
@@ -141,6 +144,7 @@ func Get(url string, timeout int, maxRetries int) ([]byte, error) {
return nil, fmt.Errorf("Perfomed max retries, no result: %v", err)
}

// Save data using file fullpath
func SaveFile(data []byte, path string) error {
err := os.WriteFile(path, data, 0644)
if err != nil {
@@ -150,6 +154,43 @@ func SaveFile(data []byte, path string) error {
return nil
}

// Save files from CDX Response channel into output directory
func SaveFiles(results <-chan []*CdxResponse, outputDir string, errors chan error, downloadRate float32) {
log.Println("[SaveFiles] worker started:", outputDir)

for {
select {
case resBatch, ok := <-results:
if ok {
for _, res := range resBatch {
data, err := res.Source.GetFile(res)
if err != nil {
errors <- err
continue
}

exts, _ := mime.ExtensionsByType(res.MimeType)
if exts == nil {
exts = []string{""}
}

filename := fmt.Sprintf("%v-%v-%v%v", res.Original, res.Timestamp, res.Source.Name(), exts[0])
escapedFilename := url.QueryEscape(filename)
fullPath := filepath.Join(outputDir, escapedFilename)

err = SaveFile(data, fullPath)
if err != nil {
errors <- err
}

time.Sleep(time.Second * time.Duration(downloadRate))
}
}
}
}

}

func GetFileExtenstion(file *[]byte) (string, error) {
contentType := http.DetectContentType(*file)
contentType = strings.Split(contentType, ";")[0]
1 change: 1 addition & 0 deletions commoncrawl/commoncrawl.go
@@ -107,6 +107,7 @@ func (cc *CommonCrawl) ParseResponse(resp []byte) ([]*common.CdxResponse, error)
if err := jsoniter.Unmarshal(line, &indexVal); err != nil {
return nil, fmt.Errorf("[ParseResponse] Cannot decode JSON line: %v. Response: %v", err, string(line))
}
indexVal.Source = cc
pages = append(pages, &indexVal)
}
