diff --git a/.gitignore b/.gitignore index 8815dff..65c4e2c 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,7 @@ data *.out out.txt *.test + +.wrangler +node_modules +package-lock.json diff --git a/internal/pdf/lex.go b/internal/pdf/lex.go index defd32f..61bd0e9 100644 --- a/internal/pdf/lex.go +++ b/internal/pdf/lex.go @@ -82,7 +82,7 @@ func (b *buffer) errorf(format string, args ...interface{}) string { return fmt.Sprintf(format, args...) } -func (b *buffer) reload() (bool, error) { +func (b *buffer) reload() { n := cap(b.buf) - int(b.offset%int64(cap(b.buf))) n, err := b.r.Read(b.buf[:n]) if n == 0 && err != nil { @@ -90,29 +90,20 @@ func (b *buffer) reload() (bool, error) { b.pos = 0 if b.allowEOF && err == io.EOF { b.eof = true - return false, err + return } - fmt.Sprint(b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)) - return false, err + panic(b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)) } b.offset += int64(n) b.buf = b.buf[:n] b.pos = 0 - return true, err } -func (b *buffer) seekForward(offset int64) (err error) { +func (b *buffer) seekForward(offset int64) { for b.offset < offset { - rel, err := b.reload() - if err != nil { - return err - } - if !rel { - return err - } + b.reload() } b.pos = len(b.buf) - int(b.offset-offset) - return err } func (b *buffer) readOffset() int64 { diff --git a/internal/pdf/page.go b/internal/pdf/page.go index 37b1adc..492f53b 100644 --- a/internal/pdf/page.go +++ b/internal/pdf/page.go @@ -55,13 +55,21 @@ Search: } // NumPage returns the number of pages in the PDF file. -func (r *Reader) NumPage() int { - return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64()) +func (r *Reader) NumPage() (n int, err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("failed to get number of pages: %v", r) + } + }() + return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64()), nil } // GetPlainText returns all the text in the PDF file func (r *Reader) GetPlainText() (reader io.Reader, err error) { - pages := r.NumPage() + pages, err := r.NumPage() + if err != nil { + return nil, err + } var buf bytes.Buffer fonts := make(map[string]*Font) for i := 1; i <= pages; i++ { diff --git a/toukibo_parser_test.go b/toukibo_parser_test.go index 195f8f6..b087fb3 100644 --- a/toukibo_parser_test.go +++ b/toukibo_parser_test.go @@ -179,3 +179,13 @@ func TestToukiboParser(t *testing.T) { }) } } + +func TestBrokenToukibo(t *testing.T) { + for i := 1; i <= 3; i++ { + pdfFileName := fmt.Sprintf("testdata/broken/broken%d.pdf", i) + _, err := GetContentByPDFPath(pdfFileName) + if err == nil { + t.Fatal("should be error") + } + } +}