Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
bjesus committed Aug 31, 2024
0 parents commit 014d66d
Show file tree
Hide file tree
Showing 14 changed files with 715 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.vscode
specs
personal
/erol
85 changes: 85 additions & 0 deletions cmd/erol/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
package main

import (
"fmt"
"log"
"os"

"github.com/urfave/cli/v2"

"github.com/bjesus/erol/common"
"github.com/bjesus/erol/internal/app"
"github.com/bjesus/erol/outputs"
)

func main() {
log.SetFlags(log.Lshortfile | log.Ltime)

app := &cli.App{
Name: "erol",
Usage: "Easy web scraping CLI tool",
Flags: []cli.Flag{
&cli.BoolFlag{
Name: "json",
Usage: "Output as JSON",
},
&cli.StringSliceFlag{
Name: "separator",
Usage: "Separator for text output (can be used multiple times)",
},
&cli.StringFlag{
Name: "template",
Usage: "Path to template file for output",
},
&cli.IntFlag{
Name: "max-pages",
Value: 3,
Usage: "Maximum number of pages to scrape",
},
},
Action: func(c *cli.Context) error {
if c.NArg() == 0 {
return fmt.Errorf("spec argument is required")
}
spec := c.Args().Get(0)
return runErol(c, spec)
},
}

if err := app.Run(os.Args); err != nil {
log.Fatal(err)
}
}

func runErol(c *cli.Context, specFile string) error {
jsonOutput := c.Bool("json")
separators := c.StringSlice("separator")
templateFile := c.String("template")
maxPages := c.Int("max-pages")

erol := &common.ErolApp{
MaxPages: maxPages,
Separator: separators,
}

log.Println("Parsing spec file:", specFile)
err := app.ParseSpecFile(erol, specFile)
if err != nil {
return fmt.Errorf("error parsing spec file: %w", err)
}

log.Println("Executing blocks")
err = app.ExecuteBlocks(erol)
if err != nil {
return fmt.Errorf("error executing blocks: %w", err)
}

log.Println("Generating output")
if jsonOutput {
return outputs.OutputJSON(erol)
} else if templateFile != "" {
return outputs.OutputTemplate(erol, templateFile)
} else {
return outputs.OutputText(erol)
}
}
15 changes: 15 additions & 0 deletions common/common.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package common

type Block struct {
Type string
Command string
Queries []string
NextPage string
}

type ErolApp struct {
Blocks []Block
Data []interface{}
MaxPages int
Separator []string
}
26 changes: 26 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
module github.com/bjesus/erol

go 1.23.0

require (
github.com/PuerkitoBio/goquery v1.9.2
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510
github.com/playwright-community/playwright-go v0.4501.1
github.com/tidwall/gjson v1.17.3
github.com/urfave/cli/v2 v2.27.4
)

require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
github.com/deckarep/golang-set/v2 v2.6.0 // indirect
github.com/go-jose/go-jose/v3 v3.0.3 // indirect
github.com/go-stack/stack v1.8.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/tidwall/match v1.1.1 // indirect
github.com/tidwall/pretty v1.2.1 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 // indirect
golang.org/x/net v0.24.0 // indirect
)
92 changes: 92 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/deckarep/golang-set/v2 v2.6.0 h1:XfcQbWM1LlMB8BsJ8N9vW5ehnnPVIw0je80NsVHagjM=
github.com/deckarep/golang-set/v2 v2.6.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4=
github.com/go-jose/go-jose/v3 v3.0.3 h1:fFKWeig/irsp7XD2zBxvnmA/XaRWp5V3CBsZXJF7G7k=
github.com/go-jose/go-jose/v3 v3.0.3/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ=
github.com/go-stack/stack v1.8.1 h1:ntEHSVwIt7PNXNpgPmVfMrNhLtgjlmnZha2kOpuRiDw=
github.com/go-stack/stack v1.8.1/go.mod h1:dcoOX6HbPZSZptuspn9bctJ+N/CnF5gGygcUP3XYfe4=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4=
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ=
github.com/mitchellh/go-ps v1.0.0 h1:i6ampVEEF4wQFF+bkYfwYgY+F/uYJDktmvLPf7qIgjc=
github.com/mitchellh/go-ps v1.0.0/go.mod h1:J4lOc8z8yJs6vUwklHw2XEIiT4z4C40KtWVN3nvg8Pg=
github.com/playwright-community/playwright-go v0.4501.1 h1:kz8SIfR6nEI8blk77nTVD0K5/i37QP5rY/o8a1fG+4c=
github.com/playwright-community/playwright-go v0.4501.1/go.mod h1:bpArn5TqNzmP0jroCgw4poSOG9gSeQg490iLqWAaa7w=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/tidwall/gjson v1.17.3 h1:bwWLZU7icoKRG+C+0PNwIKC6FCJO/Q3p2pZvuP0jN94=
github.com/tidwall/gjson v1.17.3/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
github.com/urfave/cli/v2 v2.27.4 h1:o1owoI+02Eb+K107p27wEX9Bb8eqIoZCfLXloLUSWJ8=
github.com/urfave/cli/v2 v2.27.4/go.mod h1:m4QzxcD2qpra4z7WhzEGn74WZLViBnMpb1ToCAKdGRQ=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0=
go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 h1:kx6Ds3MlpiUHKj7syVnbp57++8WpuKPcR5yjLBjvLEA=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo=
golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
132 changes: 132 additions & 0 deletions internal/app/app.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
package app

import (
"bufio"
"fmt"
"log"
"os"
"strings"

"github.com/PuerkitoBio/goquery"
"github.com/bjesus/erol/common"
"github.com/bjesus/erol/parsers"
"github.com/tidwall/gjson"
)

func ParseSpecFile(e *common.ErolApp, filename string) error {
file, err := os.Open(filename)
if err != nil {
return err
}
defer file.Close()

scanner := bufio.NewScanner(file)
var currentBlock *common.Block

for scanner.Scan() {
line := scanner.Text()

if line == "" {
if currentBlock != nil {
e.Blocks = append(e.Blocks, *currentBlock)
currentBlock = nil
}
continue
}

if strings.HasPrefix(line, "#") {
continue
}
if currentBlock == nil {
if strings.HasPrefix(line, "curl ") {
currentBlock = &common.Block{Type: "curl", Command: line}
} else if strings.HasPrefix(line, "playwright ") {
currentBlock = &common.Block{Type: "playwright", Command: line}
} else {
return fmt.Errorf("invalid block start: %s", line)
}
} else {
if strings.HasPrefix(line, "> ") {

currentBlock.NextPage = strings.TrimPrefix(line, "> ")
} else {
currentBlock.Queries = append(currentBlock.Queries, line)
}
}
}

log.Println("Found block", currentBlock)
if currentBlock != nil {
e.Blocks = append(e.Blocks, *currentBlock)
}

return scanner.Err()
}

func ExecuteBlocks(e *common.ErolApp) error {
for _, block := range e.Blocks {
var data interface{}
var err error

for page := 0; page < e.MaxPages; page++ {
if block.Type == "curl" {
data, err = parsers.ExecuteCurlBlock(block)
} else if block.Type == "playwright" {
data, err = parsers.ExecutePlaywrightBlock(block)
}

if err != nil {
return err
}

e.Data = append(e.Data, data)

if block.NextPage == "" {
break
}

nextURL, err := getNextPageURL(block, data)
if err != nil {
return err
}

block.Command = strings.Replace(block.Command, block.Command[strings.Index(block.Command, " ")+1:], nextURL, 1)
}
}

return nil
}

func getNextPageURL(block common.Block, data interface{}) (string, error) {
parts := strings.Split(block.NextPage, "|")
selector := strings.TrimSpace(parts[0])

var nextURL string

if block.Type == "curl" {
if strings.HasPrefix(selector, ".") {
// JSON mode
nextURL = gjson.Get(fmt.Sprintf("%v", data), selector).String()
} else {
// HTML mode
doc, err := goquery.NewDocumentFromReader(strings.NewReader(fmt.Sprintf("%v", data)))
if err != nil {
return "", err
}
nextURL, _ = doc.Find(selector).Attr("href")
}
} else if block.Type == "playwright" {
// TODO: Implement Playwright next page logic
return "", fmt.Errorf("playwright next page not implemented")
}

if len(parts) > 1 {
pipedURL, err := parsers.ExecutePipe(nextURL, strings.TrimSpace(parts[1]))
if err != nil {
return "", err
}
nextURL = strings.TrimSpace(pipedURL)
}

return nextURL, nil
}
17 changes: 17 additions & 0 deletions outputs/json.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package outputs

import (
"encoding/json"
"log"

"github.com/bjesus/erol/common"
)

func OutputJSON(app *common.ErolApp) error {
jsonData, err := json.MarshalIndent(app.Data, "", " ")
if err != nil {
return err
}
log.Println(string(jsonData))
return nil
}
17 changes: 17 additions & 0 deletions outputs/template.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package outputs

import (
"html/template"
"os"

"github.com/bjesus/erol/common"
)

func OutputTemplate(app *common.ErolApp, templateFile string) error {
tmpl, err := template.ParseFiles(templateFile)
if err != nil {
return err
}

return tmpl.Execute(os.Stdout, app.Data)
}
Loading

0 comments on commit 014d66d

Please sign in to comment.