diff --git a/.gitignore b/.gitignore index 65c4e2c..154c6f8 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ out.txt .wrangler node_modules package-lock.json + +settings.local.json
diff --git a/internal/pdf/errors.go b/internal/pdf/errors.go
new file mode 100644
index 0000000..7d4b664
--- /dev/null
+++ b/internal/pdf/errors.go
@@ -0,0 +1,39 @@
+package pdf
+
+import (
+	"errors"
+	"fmt"
+)
+
+// PDFError describes a failure that occurred while processing a PDF. It
+// records the operation and file offset and optionally wraps a cause.
+type PDFError struct {
+	Op      string // operation being performed
+	Offset  int64  // file offset where error occurred
+	Message string // error message
+	Err     error  // underlying error, if any
+}
+
+// Error implements the error interface. The underlying error, when present,
+// is appended to the message.
+func (e *PDFError) Error() string {
+	if e.Err != nil {
+		return fmt.Sprintf("pdf: %s at offset %d: %s: %v", e.Op, e.Offset, e.Message, e.Err)
+	}
+	return fmt.Sprintf("pdf: %s at offset %d: %s", e.Op, e.Offset, e.Message)
+}
+
+// Unwrap returns the underlying error so that errors.Is and errors.As can
+// inspect it.
+func (e *PDFError) Unwrap() error {
+	return e.Err
+}
+
+// Sentinel errors for common PDF failures; compare with errors.Is.
+var (
+	ErrMalformedPDF    = errors.New("malformed PDF")
+	ErrInvalidStream   = errors.New("invalid stream")
+	ErrInvalidObject   = errors.New("invalid object")
+	ErrMissingObject   = errors.New("missing object")
+	ErrInvalidOperator = errors.New("invalid operator")
+)
diff --git a/internal/pdf/lex.go b/internal/pdf/lex.go index 61bd0e9..d997911 100644 --- a/internal/pdf/lex.go +++ b/internal/pdf/lex.go @@ -46,6 +46,7 @@ type buffer struct { key []byte useAES bool objptr objptr + err error // last error encountered } // newBuffer returns a new buffer reading from r at the given offset.
@@ -69,7 +70,7 @@ func (b *buffer) seek(offset int64) { func (b *buffer) readByte() byte { if b.pos >= len(b.buf) { b.reload() - if b.pos >= len(b.buf) { + if b.err != nil || b.pos >= len(b.buf) { return '\n' } } @@ -92,7 +93,13 @@ func (b *buffer) reload() { b.eof = true return } - panic(b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err)) + b.err = &PDFError{ + Op: "reload", + Offset: b.offset, + Message: "reading failed", + Err: err, + } + return } b.offset += int64(n) b.buf = b.buf[:n] @@ -100,10 +107,12 @@ func (b *buffer) reload() { } func (b *buffer) seekForward(offset int64) { - for b.offset < offset { + for b.offset < offset && b.err == nil { b.reload() } - b.pos = len(b.buf) - int(b.offset-offset) + if b.err == nil { + b.pos = len(b.buf) - int(b.offset-offset) + } } func (b *buffer) readOffset() int64 { @@ -116,6 +125,10 @@ func (b *buffer) unreadByte() { } } +func (b *buffer) hasError() bool { + return b.err != nil +} + func (b *buffer) unreadToken(t token) { b.unread = append(b.unread, t) } @@ -131,12 +144,15 @@ func (b *buffer) readToken() token { c := b.readByte() for { if isSpace(c) { - if b.eof { + if b.eof || b.hasError() { return io.EOF } c = b.readByte() } else if c == '%' { for c != '\r' && c != '\n' { + if b.eof || b.hasError() { + return io.EOF + } c = b.readByte() } } else { @@ -183,6 +199,9 @@ func (b *buffer) readHexString() token { tmp := b.tmp[:0] for { Loop: + if b.eof || b.hasError() { + return b.errorf("unexpected EOF in hex string") + } c := b.readByte() if c == '>' { break @@ -191,6 +210,9 @@ func (b *buffer) readHexString() token { goto Loop } Loop2: + if b.eof || b.hasError() { + return b.errorf("unexpected EOF in hex string") + } c2 := b.readByte() if isSpace(c2) { goto Loop2 @@ -223,6 +245,9 @@ func (b *buffer) readLiteralString() token { depth := 1 Loop: for { + if b.eof || b.hasError() { + return b.errorf("unexpected EOF in literal string") + } c := b.readByte() switch c { default: @@ -284,13 +309,24 @@ Loop: 
func (b *buffer) readName() token { tmp := b.tmp[:0] for { + if b.eof || b.hasError() { + break + } c := b.readByte() if isDelim(c) || isSpace(c) { b.unreadByte() break } if c == '#' { - x := unhex(b.readByte())<<4 | unhex(b.readByte()) + if b.eof || b.hasError() { + return b.errorf("unexpected EOF in name escape") + } + b1 := b.readByte() + if b.eof || b.hasError() { + return b.errorf("unexpected EOF in name escape") + } + b2 := b.readByte() + x := unhex(b1)<<4 | unhex(b2) if x < 0 { // b.errorf("malformed name") fmt.Sprint(b.errorf("malformed name")) @@ -307,6 +343,9 @@ func (b *buffer) readName() token { func (b *buffer) readKeyword() token { tmp := b.tmp[:0] for { + if b.eof || b.hasError() { + break + } c := b.readByte() if isDelim(c) || isSpace(c) { b.unreadByte() @@ -411,7 +450,13 @@ type objdef struct { } func (b *buffer) readObject() (object, error) { + if b.hasError() { + return nil, b.err + } tok := b.readToken() + if b.hasError() { + return nil, b.err + } if kw, ok := tok.(keyword); ok { switch kw { case "null": @@ -473,6 +518,9 @@ func (b *buffer) readObject() (object, error) { func (b *buffer) readArray() object { var x array for { + if b.hasError() { + return b.err + } tok := b.readToken() if tok == nil || tok == keyword("]") { break @@ -490,6 +538,9 @@ func (b *buffer) readArray() object { func (b *buffer) readDict() object { x := make(dict) for { + if b.hasError() { + return b.err + } tok := b.readToken() if tok == nil || tok == keyword(">>") { break diff --git a/internal/pdf/page.go b/internal/pdf/page.go index 492f53b..3005bce 100644 --- a/internal/pdf/page.go +++ b/internal/pdf/page.go @@ -70,10 +70,16 @@ func (r *Reader) GetPlainText() (reader io.Reader, err error) { if err != nil { return nil, err } + if pages == 0 { + return nil, fmt.Errorf("PDF has no pages") + } var buf bytes.Buffer fonts := make(map[string]*Font) for i := 1; i <= pages; i++ { p := r.Page(i) + if p.V.IsNull() { + return nil, fmt.Errorf("failed to get page %d", i) + } for _, 
name := range p.Fonts() { // cache fonts so we don't continually parse charmap if _, ok := fonts[name]; !ok { f := p.Font(name) @@ -379,7 +385,8 @@ func readCmap(toUnicode Value) *cmap { n = int(stk.Pop().Int64()) case "endbfchar": if n < 0 { - panic("missing beginbfchar") + ok = false + return } for i := 0; i < n; i++ { repl, orig := stk.Pop().RawString(), stk.Pop().RawString() @@ -389,7 +396,8 @@ func readCmap(toUnicode Value) *cmap { n = int(stk.Pop().Int64()) case "endbfrange": if n < 0 { - panic("missing beginbfrange") + ok = false + return } for i := 0; i < n; i++ { dst, srcHi, srcLo := stk.Pop(), stk.Pop().RawString(), stk.Pop().RawString() @@ -492,10 +500,7 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) { var textBuilder bytes.Buffer showText := func(s string) { for _, ch := range enc.Decode(s) { - _, err := textBuilder.WriteRune(ch) - if err != nil { - panic(err) - } + textBuilder.WriteRune(ch) } } @@ -513,7 +518,7 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) { showText("\n") case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + return // skip invalid operator } if font, ok := fonts[args[0].Name()]; ok { enc = font.Encoder() @@ -522,17 +527,17 @@ func (p Page) GetPlainText(fonts map[string]*Font) (result string, err error) { } case "\"": // set spacing, move to next line, and show text if len(args) != 3 { - panic("bad \" operator") + return // skip invalid operator } fallthrough case "'": // move to next line and show text if len(args) != 1 { - panic("bad ' operator") + return // skip invalid operator } fallthrough case "Tj": // show text if len(args) != 1 { - panic("bad Tj operator") + return // skip invalid operator } showText(args[0].RawString()) case "TJ": // show text, allowing individual glyph positioning @@ -573,10 +578,7 @@ func (p Page) GetTextByColumn() (Columns, error) { var textBuilder bytes.Buffer for _, ch := range enc.Decode(s) { - _, err := 
textBuilder.WriteRune(ch) - if err != nil { - panic(err) - } + textBuilder.WriteRune(ch) } text := Text{ S: textBuilder.String(), @@ -642,10 +644,7 @@ func (p Page) GetTextByRow() (Rows, error) { showText := func(enc TextEncoding, currentX, currentY float64, s string) { var textBuilder bytes.Buffer for _, ch := range enc.Decode(s) { - _, err := textBuilder.WriteRune(ch) - if err != nil { - panic(err) - } + textBuilder.WriteRune(ch) } // if DebugOn { @@ -720,7 +719,7 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin case "T*": // move to start of next line case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + return // skip invalid operator } if font, ok := fonts[args[0].Name()]; ok { @@ -730,17 +729,17 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin } case "\"": // set spacing, move to next line, and show text if len(args) != 3 { - panic("bad \" operator") + return // skip invalid operator } fallthrough case "'": // move to next line and show text if len(args) != 1 { - panic("bad ' operator") + return // skip invalid operator } fallthrough case "Tj": // show text if len(args) != 1 { - panic("bad Tj operator") + return // skip invalid operator } walker(enc, currentX, currentY, args[0].RawString()) @@ -813,7 +812,7 @@ func (p Page) Content() Content { case "cm": // update g.CTM if len(args) != 6 { - panic("bad g.Tm") + return // skip invalid operator } var m matrix for i := 0; i < 6; i++ { @@ -841,7 +840,7 @@ func (p Page) Content() Content { case "re": // append rectangle to path if len(args) != 4 { - panic("bad re") + return // skip invalid operator } x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64() rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}}) @@ -867,19 +866,19 @@ func (p Page) Content() Content { case "Tc": // set character spacing if len(args) != 1 { - panic("bad g.Tc") + return // skip invalid operator } g.Tc = 
args[0].Float64() case "TD": // move text position and set leading if len(args) != 2 { - panic("bad Td") + return // skip invalid operator } g.Tl = -args[1].Float64() fallthrough case "Td": // move text position if len(args) != 2 { - panic("bad Td") + return // skip invalid operator } tx := args[0].Float64() ty := args[1].Float64() @@ -889,7 +888,7 @@ func (p Page) Content() Content { case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + return // skip invalid operator } f := args[0].Name() g.Tf = p.Font(f) @@ -901,7 +900,7 @@ func (p Page) Content() Content { case "\"": // set spacing, move to next line, and show text if len(args) != 3 { - panic("bad \" operator") + return // skip invalid operator } g.Tw = args[0].Float64() g.Tc = args[1].Float64() @@ -909,7 +908,7 @@ func (p Page) Content() Content { fallthrough case "'": // move to next line and show text if len(args) != 1 { - panic("bad ' operator") + return // skip invalid operator } x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} g.Tlm = x.mul(g.Tlm) @@ -917,7 +916,7 @@ func (p Page) Content() Content { fallthrough case "Tj": // show text if len(args) != 1 { - panic("bad Tj operator") + return // skip invalid operator } showText(args[0].RawString()) @@ -942,13 +941,13 @@ func (p Page) Content() Content { case "TL": // set text leading if len(args) != 1 { - panic("bad TL") + return // skip invalid operator } g.Tl = args[0].Float64() case "Tm": // set text matrix and line matrix if len(args) != 6 { - panic("bad g.Tm") + return // skip invalid operator } var m matrix for i := 0; i < 6; i++ { @@ -960,25 +959,25 @@ func (p Page) Content() Content { case "Tr": // set text rendering mode if len(args) != 1 { - panic("bad Tr") + return // skip invalid operator } g.Tmode = int(args[0].Int64()) case "Ts": // set text rise if len(args) != 1 { - panic("bad Ts") + return // skip invalid operator } g.Trise = args[0].Float64() case "Tw": // set word spacing if len(args) != 1 { - panic("bad g.Tw") + 
return // skip invalid operator } g.Tw = args[0].Float64() case "Tz": // set horizontal text scaling if len(args) != 1 { - panic("bad Tz") + return // skip invalid operator } g.Th = args[0].Float64() / 100 } diff --git a/internal/pdf/ps.go b/internal/pdf/ps.go index c7ec20e..62e424a 100644 --- a/internal/pdf/ps.go +++ b/internal/pdf/ps.go @@ -85,31 +85,31 @@ Reading: continue case "currentdict": if len(dicts) == 0 { - panic("no current dictionary") + continue // skip - no current dictionary } stk.Push(Value{nil, objptr{}, dicts[len(dicts)-1]}) continue case "begin": d := stk.Pop() if d.Kind() != Dict { - panic("cannot begin non-dict") + continue // skip - cannot begin non-dict } dicts = append(dicts, d.data.(dict)) continue case "end": if len(dicts) <= 0 { - panic("mismatched begin/end") + continue // skip - mismatched begin/end } dicts = dicts[:len(dicts)-1] continue case "def": if len(dicts) <= 0 { - panic("def without open dict") + continue // skip - def without open dict } val := stk.Pop() key, ok := stk.Pop().data.(name) if !ok { - panic("def of non-name") + continue // skip - def of non-name } dicts[len(dicts)-1][key] = val.data continue diff --git a/internal/pdf/read.go b/internal/pdf/read.go index ee6a981..7abc1b1 100644 --- a/internal/pdf/read.go +++ b/internal/pdf/read.go @@ -145,6 +145,9 @@ func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, e } pos := end - endChunk + int64(i) b := newBuffer(io.NewSectionReader(f, pos, end-pos), pos) + if b.hasError() { + return nil, b.err + } if b.readToken() != keyword("startxref") { return nil, fmt.Errorf("malformed PDF file: missing startxref") } @@ -192,7 +195,13 @@ func (r *Reader) Trailer() Value { } func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) { + if b.hasError() { + return nil, objptr{}, nil, b.err + } tok := b.readToken() + if b.hasError() { + return nil, objptr{}, nil, b.err + } if tok == keyword("xref") { return readXrefTable(r, b) } @@ -408,7 +417,13 @@ func 
readXrefTable(r *Reader, b *buffer) ([]xref, objptr, dict, error) { func readXrefTableData(b *buffer, table []xref) ([]xref, error) { for { + if b.hasError() { + return nil, b.err + } tok := b.readToken() + if b.hasError() { + return nil, b.err + } if tok == keyword("trailer") { break } @@ -736,15 +751,15 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { Search: for { if strm.Kind() != Stream { - panic("not a stream") + return Value{} // not a stream } if strm.Key("Type").Name() != "ObjStm" { - panic("not an object stream") + return Value{} // not an object stream } n := int(strm.Key("N").Int64()) first := strm.Key("First").Int64() if first == 0 { - panic("missing First") + return Value{} // missing First } b := newBuffer(strm.Reader(), 0) b.allowEOF = true @@ -762,7 +777,7 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { } ext := strm.Key("Extends") if ext.Kind() != Stream { - panic("cannot find object in stream") + return Value{} // cannot find object in stream } strm = ext } @@ -776,11 +791,10 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { } def, ok := obj.(objdef) if !ok { - panic(fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)) - //return Value{} + return Value{} // loading error: found wrong type instead of objdef } if def.ptr != ptr { - panic(fmt.Errorf("loading %v: found %v", ptr, def.ptr)) + return Value{} // loading error: found wrong object pointer } x = def.obj } @@ -793,7 +807,6 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { case string: return Value{r, parent, x} default: - // panic(fmt.Errorf("unexpected value type %T in resolve", x)) fmt.Sprintf("unexpected value type %T in resolve", x) return Value{} }
diff --git a/internal/toukibo/date_parser.go b/internal/toukibo/date_parser.go
new file mode 100644
index 0000000..64510a4
--- /dev/null
+++ b/internal/toukibo/date_parser.go
@@ -0,0 +1,62 @@
+package toukibo
+
+import (
+	"fmt"
+	"regexp"
+	"strings"
+)
+
+// DateExtractor extracts the date text that precedes one of a fixed set of
+// suffix keywords.
+type DateExtractor struct {
+	suffixes []string
+	pattern  *regexp.Regexp
+}
+
+// NewDateExtractor builds a DateExtractor for the given suffixes. Each suffix
+// is escaped with regexp.QuoteMeta so that it is matched literally and cannot
+// be interpreted as pattern syntax.
+func NewDateExtractor(suffixes []string) *DateExtractor {
+	quoted := make([]string, len(suffixes))
+	for i, suffix := range suffixes {
+		quoted[i] = regexp.QuoteMeta(suffix)
+	}
+	pattern := fmt.Sprintf(`([%s]+) *(%s)`, ZenkakuStringPattern, strings.Join(quoted, "|"))
+	return &DateExtractor{
+		suffixes: suffixes,
+		pattern:  regexp.MustCompile(pattern),
+	}
+}
+
+// Extract returns the text captured in front of the first suffix match in
+// text. The boolean is false when text contains no match.
+func (de *DateExtractor) Extract(text string) (string, bool) {
+	matches := de.pattern.FindStringSubmatch(text)
+	if len(matches) < 2 {
+		return "", false
+	}
+	return matches[1], true
+}
+
+// Shared extractors, compiled once at package initialization.
+var (
+	// RegisterDateExtractor extracts the date before "登記" (registration).
+	RegisterDateExtractor = NewDateExtractor([]string{"登記"})
+	// ResignDateExtractor extracts the date before "退任" or "辞任" (leaving office).
+	ResignDateExtractor = NewDateExtractor([]string{"退任", "辞任"})
+	// AppointDateExtractor extracts the date before "就任" (taking office).
+	AppointDateExtractor = NewDateExtractor([]string{"就任"})
+	// DissolvedDateExtractor extracts the date before "解散" (dissolution).
+	DissolvedDateExtractor = NewDateExtractor([]string{"解散"})
+	// BankruptDateExtractor extracts the date before "破産手続開始" (start of bankruptcy proceedings).
+	BankruptDateExtractor = NewDateExtractor([]string{"破産手続開始"})
+	// ContinuedDateExtractor extracts the date before "継続" (continuation).
+	ContinuedDateExtractor = NewDateExtractor([]string{"継続"})
+)
+
+// ExtractDateWithSuffix extracts a date from text using an ad-hoc set of
+// suffixes. It compiles a new pattern on every call; prefer a package-level
+// DateExtractor when the suffix set is fixed.
+func ExtractDateWithSuffix(text string, suffixes []string) (string, bool) {
+	return NewDateExtractor(suffixes).Extract(text)
+}
diff --git a/internal/toukibo/field_parser.go b/internal/toukibo/field_parser.go new file mode 100644 index 0000000..c734151 --- /dev/null +++ b/internal/toukibo/field_parser.go @@ -0,0 +1,153 @@ +package toukibo + +import "strings" + +// FieldProcessor はフィールド処理のインターフェース +type FieldProcessor interface { + // Matches はフィールドがこのプロセッサで処理可能かを判定する + Matches(s string) bool + // Process はフィールドを処理する + Process(h *HoujinBody, s string) bool +} + +// fieldProcessor は具体的なフィールド処理の実装 +type fieldProcessor struct { + matcher func(string) bool + process func(*HoujinBody, string) bool +} + +func (fp *fieldProcessor) Matches(s string) bool { + return fp.matcher(s) +} + +func (fp *fieldProcessor)
Process(h *HoujinBody, s string) bool { + return fp.process(h, s) +} + +// NewFieldProcessor は新しいFieldProcessorを作成する +func NewFieldProcessor(matcher func(string) bool, process func(*HoujinBody, string) bool) FieldProcessor { + return &fieldProcessor{ + matcher: matcher, + process: process, + } +} + +// FieldProcessorRegistry はフィールドプロセッサのレジストリ +type FieldProcessorRegistry struct { + processors []FieldProcessor +} + +// NewFieldProcessorRegistry は新しいレジストリを作成する +func NewFieldProcessorRegistry() *FieldProcessorRegistry { + return &FieldProcessorRegistry{ + processors: []FieldProcessor{ + // 商号 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "商 号") || strings.Contains(s, "名 称") + }, + func(h *HoujinBody, s string) bool { + return h.processHoujinName(s) + }, + ), + // 本店 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "本 店") || strings.Contains(s, "主たる事務所") + }, + func(h *HoujinBody, s string) bool { + return h.processHoujinAddress(s) + }, + ), + // 資本金 + NewFieldProcessor( + func(s string) bool { + // ConsumeHoujinCapitalはインスタンスメソッドなので、別途定義 + return strings.HasPrefix(s, " ┃資本金の額") || strings.HasPrefix(s, " ┃特定資本金の額") || + strings.Contains(s, " ┃資本金") || + strings.Contains(s, "払込済出資総額") || strings.Contains(s, "出資の総額") || + strings.Contains(s, "資産の総額") || strings.Contains(s, "基本財産の総額") || strings.Contains(s, "特定資本の額") || + strings.Contains(s, "払い込んだ出資の") + }, + func(h *HoujinBody, s string) bool { + return h.processHoujinCapital(s) + }, + ), + // 発行済株式 + NewFieldProcessor( + func(s string) bool { + return strings.HasPrefix(s, " ┃発行済株式の総数") + }, + func(h *HoujinBody, s string) bool { + return h.processHoujinStock(s) + }, + ), + // 登記記録 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "登記記録に関する") + }, + func(h *HoujinBody, s string) bool { + // 登記記録はスキップするだけ + return true + }, + ), + // 役員 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "役員に関する事項") || strings.Contains(s, 
"社員に関する事項") + }, + func(h *HoujinBody, s string) bool { + // 役員情報は別途処理される + return true + }, + ), + // 会社成立日 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "会社成立の年月日") || strings.Contains(s, "法人成立の年月日") + }, + func(h *HoujinBody, s string) bool { + return h.ConsumeHoujinCreatedAt(s) + }, + ), + // 破産 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "┃破 産") + }, + func(h *HoujinBody, s string) bool { + return h.ConsumeHoujinBankruptedAt(s) + }, + ), + // 解散 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "┃解 散") + }, + func(h *HoujinBody, s string) bool { + return h.ConsumeHoujinDissolvedAt(s) + }, + ), + // 継続 + NewFieldProcessor( + func(s string) bool { + return strings.Contains(s, "┃会社継続") + }, + func(h *HoujinBody, s string) bool { + return h.ConsumeHoujinContinuedAt(s) + }, + ), + }, + } +} + +// Process は与えられた文字列を適切なプロセッサで処理する +func (r *FieldProcessorRegistry) Process(h *HoujinBody, s string) bool { + for _, processor := range r.processors { + if processor.Matches(s) { + return processor.Process(h, s) + } + } + return false +} \ No newline at end of file diff --git a/internal/toukibo/houjin_body.go b/internal/toukibo/houjin_body.go index ae3d182..7d5e14d 100644 --- a/internal/toukibo/houjin_body.go +++ b/internal/toukibo/houjin_body.go @@ -75,61 +75,32 @@ func (h *HoujinBody) GetHoujinRepresentatives() ([]HoujinExecutiveValue, error) return []HoujinExecutiveValue{}, fmt.Errorf("not found representative") } - var res []HoujinExecutiveValue // 代表清算人が代表となる場合 - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "代表清算人") && v.IsValid { - res = append(res, v) - } - } - } + res := h.FindExecutivesByPosition("代表清算人") if len(res) > 0 { return res, nil } // 清算人が代表となる場合 - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "清算人") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("清算人") if len(res) > 
0 { return res, nil } // 破産管財人が代表となる場合 - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "破産管財人") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("破産管財人") if len(res) > 0 { return res, nil } // 保全管財人が代表となる場合 // 今のところこのケースは見つけていないが、sample1047を見て必要だと判断 - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "保全管財人") && v.IsValid { - res = append(res, v) - } - } + res = h.FindExecutivesByPosition("保全管財人") + if len(res) > 0 { + return res, nil } - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "代表取締役" || v.Position == "代表理事" || v.Position == "代表社員" || v.Position == "会長" || - v.Position == "代表役員" || v.Position == "代表者" || v.Position == "理事長" || v.Position == "会頭" || - v.Position == "学長" || v.Position == "代表執行役") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("代表取締役", "代表理事", "代表社員", "会長", + "代表役員", "代表者", "理事長", "会頭", "学長", "代表執行役") if len(res) > 0 { return res, nil } @@ -138,65 +109,56 @@ func (h *HoujinBody) GetHoujinRepresentatives() ([]HoujinExecutiveValue, error) // 特定目的会社、有限会社は取締役が代表となる if houjinKaku == HoujinKakuYugen || houjinKaku == HoujinKakuTokuteiMokuteki { - var res []HoujinExecutiveValue - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "取締役") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("取締役") if len(res) > 0 { return res, nil } } if houjinKaku == HoujinKakuGousi { - var res []HoujinExecutiveValue - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "無限責任社員") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("無限責任社員") if len(res) > 0 { return res, nil } } // 理事が代表となる場合 - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "理事") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("理事") if 
len(res) > 0 { return res, nil } // 監査役が代表となる場合 - for _, e := range h.HoujinExecutive { - for _, v := range e { - if (v.Position == "監査役") && v.IsValid { - res = append(res, v) - } - } - } + res = h.FindExecutivesByPosition("監査役") if len(res) > 0 { return res, nil } // 社員が代表となる場合 + res = h.FindExecutivesByPosition("社員") + if len(res) > 0 { + return res, nil + } + return []HoujinExecutiveValue{}, fmt.Errorf("not found representative") +} + +// FindExecutivesByPosition は指定された役職の役員を検索する共通関数 +func (h *HoujinBody) FindExecutivesByPosition(positions ...string) []HoujinExecutiveValue { + var res []HoujinExecutiveValue for _, e := range h.HoujinExecutive { for _, v := range e { - if (v.Position == "社員") && v.IsValid { + if v.IsValid && contains(positions, v.Position) { res = append(res, v) } } } - if len(res) > 0 { - return res, nil + return res +} + +// contains はスライスに要素が含まれているかチェックする +func contains(slice []string, item string) bool { + for _, s := range slice { + if s == item { + return true + } } - return []HoujinExecutiveValue{}, fmt.Errorf("not found representative") + return false }
diff --git a/internal/toukibo/number_parser.go b/internal/toukibo/number_parser.go
new file mode 100644
index 0000000..a2426d3
--- /dev/null
+++ b/internal/toukibo/number_parser.go
@@ -0,0 +1,92 @@
+package toukibo
+
+import (
+	"regexp"
+	"strconv"
+	"strings"
+)
+
+// UnitMultiplier pairs a Japanese numeric unit character with the factor
+// applied to the digits written in front of it.
+type UnitMultiplier struct {
+	Unit       rune
+	Multiplier int
+}
+
+// JapaneseNumberUnits lists the common large-number units, largest first.
+var JapaneseNumberUnits = []UnitMultiplier{
+	{'兆', 1000000000000},
+	{'億', 100000000},
+	{'万', 10000},
+}
+
+// digitRunPattern matches one run of ASCII digits. It is compiled once at
+// package scope rather than on every ParseJapaneseNumber call.
+var digitRunPattern = regexp.MustCompile(`\d+`)
+
+// cutUnitValue finds the first run of ASCII digits in s that is immediately
+// followed by unit. It returns the digits and s with the digits and the unit
+// removed. When no such run exists, ok is false and rest is s unchanged.
+func cutUnitValue(s string, unit rune) (digits, rest string, ok bool) {
+	marker := string(unit)
+	for from := 0; ; {
+		i := strings.Index(s[from:], marker)
+		if i < 0 {
+			return "", s, false
+		}
+		end := from + i
+		start := end
+		for start > 0 && '0' <= s[start-1] && s[start-1] <= '9' {
+			start--
+		}
+		if start < end {
+			return s[start:end], s[:start] + s[end+len(marker):], true
+		}
+		// No digits precede this occurrence of the unit; keep scanning.
+		from = end + len(marker)
+	}
+}
+
+// ParseJapaneseNumber parses a Japanese numeral (for example "1億2345万6789"
+// with JapaneseNumberUnits) into an int. For each entry of units, in order,
+// the first run of digits directly followed by that unit is multiplied by its
+// Multiplier; the first run of digits left over afterwards is added unscaled.
+// The unit is matched literally, without compiling a regular expression per
+// unit. Products and sums are not checked for overflow.
+func ParseJapaneseNumber(s string, units []UnitMultiplier) (int, error) {
+	// Convert full-width digits to half-width first, as the original code did.
+	remaining := ZenkakuToHankaku(s)
+
+	total := 0
+	for _, unit := range units {
+		digits, rest, ok := cutUnitValue(remaining, unit.Unit)
+		if !ok {
+			continue
+		}
+		value, err := strconv.Atoi(digits)
+		if err != nil {
+			return 0, err
+		}
+		total += value * unit.Multiplier
+		remaining = rest
+	}
+
+	// Digits left over after the unit passes carry no unit; add them as-is.
+	if digits := digitRunPattern.FindString(remaining); digits != "" {
+		value, err := strconv.Atoi(digits)
+		if err != nil {
+			return 0, err
+		}
+		total += value
+	}
+	return total, nil
+}
+
+// ParseJapaneseNumberWithSuffix strips suffix from the end of s and a leading
+// "金" from the start, then parses the rest with ParseJapaneseNumber.
+func ParseJapaneseNumberWithSuffix(s string, units []UnitMultiplier, suffix string) (int, error) {
+	s = strings.TrimSpace(strings.TrimSuffix(s, suffix))
+	// TrimPrefix is a no-op when the prefix is absent, so no HasPrefix guard.
+	s = strings.TrimPrefix(s, "金")
+	return ParseJapaneseNumber(s, units)
+}
diff --git a/internal/toukibo/parse_body.go b/internal/toukibo/parse_body.go index e1e0b63..53119e5 100644 --- a/internal/toukibo/parse_body.go +++ b/internal/toukibo/parse_body.go @@ -41,7 +41,7 @@ func getValue(s string) (string, error) { var value string var three []string // 登記日など - for _, l := range extractLines(s) { + for _, l := range ExtractLines(s) { _, remain := getPartOne(l) b, c := getPartTwo(remain) value += trimLeadingTrailingSpace(b) @@ -55,21 +55,18 @@ func getValue(s string) (string, error) { } func getRegisterAt(s string) (string, error) { - pattern := fmt.Sprintf(`([%s]+) *登記`, ZenkakuStringPattern) - regex := regexp.MustCompile(pattern) - matches := regex.FindStringSubmatch(s) - if len(matches) > 0 { - return trimAllSpace(matches[1]), nil + date, found := RegisterDateExtractor.Extract(s) + if found { + return trimAllSpace(date), nil } return "", fmt.Errorf("failed to get registerAt from %s", s) } func getResignedAt(s string) (string, error) { - pattern := fmt.Sprintf(`([%s]+) *(辞任|退任|死亡|抹消|廃止|解任|退社)`, ZenkakuStringPattern) - regex := 
regexp.MustCompile(pattern) - matches := regex.FindStringSubmatch(s) - if len(matches) > 0 { - return trimAllSpace(matches[1]), nil + // 辞任/退任の他に死亡、抹消、廃止、解任、退社も含める + date, found := ExtractDateWithSuffix(s, []string{"辞任", "退任", "死亡", "抹消", "廃止", "解任", "退社"}) + if found { + return trimAllSpace(date), nil } return "", fmt.Errorf("failed to get resignedAt from %s", s) } @@ -131,7 +128,7 @@ func normalizeExecutiveName(s string) string { func getMultipleExecutiveNamesAndPositions(s string) (result []struct{ Name, Position string }, three []string) { var onNameAndPos bool - for _, l := range extractLines(s) { + for _, l := range ExtractLines(s) { _, remain := getPartOne(l) b, c := getPartTwo(remain) three = append(three, c) @@ -180,7 +177,7 @@ func getMultipleExecutiveNamesAndPositions(s string) (result []struct{ Name, Pos } // ┃ * ┃ or ┃ * ┨ の中身を抽出 -func extractLines(s string) []string { +func ExtractLines(s string) []string { var res []string cur := "" for _, r := range s { @@ -224,6 +221,121 @@ func splitThree(s string) (string, string, string) { return trimLeadingTrailingSpace(partOne), trimLeadingTrailingSpace(partTwo), trimLeadingTrailingSpace(partThree) } +func extractExecutiveInfo(part string) ([]HoujinExecutiveValue, []string) { + posAndNames, three := getMultipleExecutiveNamesAndPositions(part) + evs := make([]HoujinExecutiveValue, 0, len(posAndNames)) + + for _, posAndName := range posAndNames { + ev := HoujinExecutiveValue{ + IsValid: true, + Name: posAndName.Name, + Position: posAndName.Position, + } + evs = append(evs, ev) + } + + return evs, three +} + +func extractDates(three []string) (registerAt, resignedAt string) { + for _, t := range three { + if at, _ := getRegisterAt(t); at != "" { + registerAt = at + } + if at, _ := getResignedAt(t); at != "" { + resignedAt = at + } + } + return +} + +func applyDatesToExecutives(evs []HoujinExecutiveValue, three []string) { + registerAt, resignedAt := extractDates(three) + + if registerAt != "" { + for i := range 
evs { + evs[i].RegisterAt = registerAt + } + } + if resignedAt != "" { + for i := range evs { + evs[i].ResignedAt = resignedAt + evs[i].IsValid = false + } + } +} + +func handlePreviousExecutiveRelation(evsArr HoujinExecutiveValueArray, idx int, + currentEvs []HoujinExecutiveValue, three []string, registerAt, resignedAt string) { + + if idx == 0 { + return + } + + if len(currentEvs) == 0 { + // 役員がないのに登記日がある場合は前の役員を対象にする + if registerAt != "" { + evsArr[idx-1].RegisterAt = registerAt + evsArr[idx-1].ResignedAt = resignedAt + evsArr[idx-1].IsValid = false + } + return + } + + if len(currentEvs) == 1 { + handleSingleExecutive(evsArr, idx, currentEvs[0], three) + } +} + +func handleSingleExecutive(evsArr HoujinExecutiveValueArray, idx int, + currentEv HoujinExecutiveValue, three []string) { + + prev := &evsArr[idx-1] + + // 同一役員チェック + if prev.Name == currentEv.Name && prev.Position == currentEv.Position { + prev.IsValid = false + return + } + + joinedThree := strings.Join(three, "") + + // 重任チェック + if strings.Contains(joinedThree, "重任") && prev.Position == currentEv.Position { + prev.IsValid = false + return + } + + // 氏名変更チェック + if isNameChange(joinedThree, prev.Name) { + prev.IsValid = false + return + } + + // 更正チェック + if strings.Contains(joinedThree, "更正") { + prev.IsValid = false + return + } +} + +func isNameChange(text, name string) bool { + // 氏名変更の各種パターンをチェック + patterns := []string{ + name + "の氏変更", + name + "の氏名変更", + name + "の名称変更", + name + "の名", + } + + for _, pattern := range patterns { + if strings.Contains(text, pattern) { + return true + } + } + return false +} + func GetHoujinExecutiveValue(s string) (HoujinExecutiveValueArray, error) { if DebugOn { PrintBar() @@ -236,93 +348,36 @@ func GetHoujinExecutiveValue(s string) (HoujinExecutiveValueArray, error) { var idx int for _, p := range parts { if DebugOn { - PrintSlice(extractLines(p)) - } - - var evs []HoujinExecutiveValue - - // 役員名、役職を取得 - posAndNames, three := 
getMultipleExecutiveNamesAndPositions(p) - for _, posAndName := range posAndNames { - ev := HoujinExecutiveValue{ - IsValid: true, - Name: posAndName.Name, - Position: posAndName.Position, - } - evs = append(evs, ev) - } - - // 登記日、辞任日を取得 - var registerAt, resignedAt string - for _, t := range three { - if at, _ := getRegisterAt(t); at != "" { - registerAt = at - } - if at, _ := getResignedAt(t); at != "" { - resignedAt = at - } - } - if registerAt != "" { - for i := range evs { - evs[i].RegisterAt = registerAt - } - } - if resignedAt != "" { - for i := range evs { - evs[i].ResignedAt = resignedAt - evs[i].IsValid = false - } + PrintSlice(ExtractLines(p)) } - if idx > 0 { - if len(evs) == 0 { - if registerAt != "" { - evsArr[idx-1].RegisterAt = registerAt - evsArr[idx-1].ResignedAt = resignedAt - evsArr[idx-1].IsValid = false - } - continue - } - - // 簡単のために、一つの欄に複数の役員情報が記載されている場合は無効処理等をスキップしている - if len(evs) == 1 { - // 同じ氏名、役職の役員が連続している場合、前の役員を無効にする - if evsArr[idx-1].Name == evs[0].Name && evsArr[idx-1].Position == evs[0].Position { - evsArr[idx-1].IsValid = false - } - - joinedThree := strings.Join(three, "") - if strings.Contains(joinedThree, "重任") && - evsArr[idx-1].Position == evs[0].Position { // sample788用のハック - evsArr[idx-1].IsValid = false - } - - // sample 30, 89, 106用のハック - // XXXXの氏/名称変更がある場合、その前の役員は無効にする - if strings.Contains(joinedThree, evsArr[idx-1].Name+"の氏変更") || - strings.Contains(joinedThree, evsArr[idx-1].Name+"の氏名変更") || - strings.Contains(joinedThree, evsArr[idx-1].Name+"の名称変更") || - strings.Contains(joinedThree, evsArr[idx-1].Name+"の名") { - evsArr[idx-1].IsValid = false - } - - // sample1385用のハック - // 平成29年 8月24日更正の場合、その前の役員は無効にする - if strings.Contains(joinedThree, "更正") { - evsArr[idx-1].IsValid = false - } - - } + // 役員情報の抽出 + evs, three := extractExecutiveInfo(p) + + // 日付情報の適用 + applyDatesToExecutives(evs, three) + + // 日付だけ抽出(前の役員処理用) + registerAt, resignedAt := extractDates(three) + + // 前の役員との関係処理 + 
handlePreviousExecutiveRelation(evsArr, idx, evs, three, registerAt, resignedAt) + + // 結果の更新 + if len(evs) > 0 { + idx += len(evs) + evsArr = append(evsArr, evs...) + } else if idx > 0 && registerAt != "" { + // 役員がないのに登記日がある場合は前の役員処理は上で完了しているので、ここではスキップ + continue } - - idx += len(evs) - evsArr = append(evsArr, evs...) } + if DebugOn { fmt.Println(evsArr) } + return evsArr, nil - } func (h *HoujinBody) ConsumeHoujinNumber(s string) bool { @@ -397,6 +452,7 @@ func (h *HoujinBody) ConsumeHoujinCreatedAt(s string) bool { } func (h *HoujinBody) ConsumeHoujinBankruptedAt(s string) bool { + // 破産日は特殊なフォーマットなので専用パターンを使う pattern := fmt.Sprintf("┃破 産 *│ *([%s]+日)([%s]*)", ZenkakuStringPattern, ZenkakuStringPattern) regex := regexp.MustCompile(pattern) @@ -414,7 +470,7 @@ func (h *HoujinBody) ConsumeHoujinDissolvedAt(s string) bool { } var s2 string - for _, p := range extractLines(s) { + for _, p := range ExtractLines(s) { _, b, _ := splitThree(p) s2 += b } @@ -433,7 +489,7 @@ func (h *HoujinBody) ConsumeHoujinDissolvedAt(s string) bool { } func (h *HoujinBody) ConsumeHoujinContinuedAt(s string) bool { - // ex 令和2年7月1日会社継続 + // 継続日も特殊なフォーマット pattern := fmt.Sprintf("┃会社継続 *│ *([%s]+日)会社継続", ZenkakuStringPattern) regex := regexp.MustCompile(pattern) @@ -445,15 +501,102 @@ func (h *HoujinBody) ConsumeHoujinContinuedAt(s string) bool { return false } +func (h *HoujinBody) shouldSkipField(s string) bool { + skipKeywords := []string{ + "発行可能株式総数", "┃目 的", "┃目的等", + "出資1口の金額", "出資の総口数", "出資払込の方法", + "株式の譲渡制限", "株券を発行する旨", + "取締役等の会社", "非業務執行取締役", + "取締役会設置会社", "監査役設置会社", "会計監査人設置会", + "解散の事由", "監査役会設置会社", + "地 区", "支 店", "従たる事務所", + } + + for _, keyword := range skipKeywords { + if strings.Contains(s, keyword) { + return true + } + } + return false +} + +func (h *HoujinBody) processHoujinName(s string) bool { + v, err := GetHoujinValue(s) + if err != nil { + return false + } + h.HoujinName = v + return true +} + +func (h *HoujinBody) processHoujinAddress(s string) bool { + v, 
err := GetHoujinValue(s) + if err != nil { + return false + } + h.HoujinAddress = v + return true +} + +func (h *HoujinBody) processHoujinCapital(s string) bool { + v, err := GetHoujinValue(s) + if err != nil { + return false + } + h.HoujinCapital = v + + // sample126, sample508用のハック + // 金0円(債務超過額 金2873万8754円)のような場合、資本金は金0円とする + for i, v := range h.HoujinCapital { + if strings.Contains(v.Value, "債務超過額") { + pattern := fmt.Sprintf("金([%s]+)円", ZenkakuNumberPattern) + regex := regexp.MustCompile(pattern) + matches := regex.FindStringSubmatch(v.Value) + if len(matches) > 0 { + h.HoujinCapital[i].Value = "金" + matches[1] + "円" + } + } + } + return true +} + +func (h *HoujinBody) processHoujinStock(s string) bool { + v, err := GetHoujinValue(s) + if err != nil { + return false + } + h.HoujinStock = v + return true +} + +func (h *HoujinBody) processHoujinToukiRecord(s string) bool { + v, err := GetHoujinValue(s) + if err != nil { + return false + } + h.HoujinToukiRecord = v + return true +} + +func (h *HoujinBody) processHoujinExecutive(s string) bool { + executives := splitExecutives(s) + h.HoujinExecutive = make([]HoujinExecutiveValueArray, len(executives)) + for i, e := range executives { + if strings.Contains(e, "監査役の監査の範囲を会計に関するものに限定") { + continue + } + + v, err := GetHoujinExecutiveValue(e) + if err != nil { + return false + } + h.HoujinExecutive[i] = v + } + return true +} + func (h *HoujinBody) ParseBodyMain(s string) error { - if strings.Contains(s, "発行可能株式総数") || strings.Contains(s, "┃目 的") || strings.Contains(s, "┃目的等") || - strings.Contains(s, "出資1口の金額") || strings.Contains(s, "出資の総口数") || strings.Contains(s, "出資払込の方法") || - strings.Contains(s, "株式の譲渡制限") || strings.Contains(s, "株券を発行する旨") || - strings.Contains(s, "取締役等の会社") || strings.Contains(s, "非業務執行取締役") || - strings.Contains(s, "取締役会設置会社") || strings.Contains(s, "監査役設置会社") || strings.Contains(s, "会計監査人設置会") || - strings.Contains(s, "解散の事由") || strings.Contains(s, "監査役会設置会社") || - strings.Contains(s, 
"地 区") || strings.Contains(s, "支 店") || strings.Contains(s, "従たる事務所") { - // skip + if h.shouldSkipField(s) { return nil } @@ -461,22 +604,18 @@ func (h *HoujinBody) ParseBodyMain(s string) error { return nil } + // レガシー互換性のため、Consume系メソッドを残しつつ、processメソッドも呼ぶ if h.ConsumeHoujinName(s) { - v, err := GetHoujinValue(s) - if err != nil { - return err + if !h.processHoujinName(s) { + return fmt.Errorf("failed to process houjin name") } - - h.HoujinName = v return nil } if h.ConsumeHoujinAddress(s) { - v, err := GetHoujinValue(s) - if err != nil { - return err + if !h.processHoujinAddress(s) { + return fmt.Errorf("failed to process houjin address") } - h.HoujinAddress = v return nil } if h.ConsumeHoujinKoukoku(s) { @@ -495,57 +634,27 @@ func (h *HoujinBody) ParseBodyMain(s string) error { return nil } if h.ConsumeHoujinCapital(s) { - v, err := GetHoujinValue(s) - if err != nil { - return err - } - h.HoujinCapital = v - - // sample126, sample508用のハック - // 金0円(債務超過額 金2873万8754円)のような場合、資本金は金0円とする - - for i, v := range h.HoujinCapital { - if strings.Contains(v.Value, "債務超過額") { - pattern := fmt.Sprintf("金([%s]+)円", ZenkakuNumberPattern) - regex := regexp.MustCompile(pattern) - matches := regex.FindStringSubmatch(v.Value) - if len(matches) > 0 { - h.HoujinCapital[i].Value = "金" + matches[1] + "円" - } - } + if !h.processHoujinCapital(s) { + return fmt.Errorf("failed to process houjin capital") } return nil } if h.ConsumeHoujinStock(s) { - v, err := GetHoujinValue(s) - if err != nil { - return err + if !h.processHoujinStock(s) { + return fmt.Errorf("failed to process houjin stock") } - h.HoujinStock = v return nil } if h.ConsumeHoujinToukiRecord(s) { - v, err := GetHoujinValue(s) - if err != nil { - return err + if !h.processHoujinToukiRecord(s) { + return fmt.Errorf("failed to process houjin touki record") } - h.HoujinToukiRecord = v return nil } if h.ConsumeHoujinExecutive(s) { - executives := splitExecutives(s) - h.HoujinExecutive = make([]HoujinExecutiveValueArray, 
len(executives)) - for i, e := range executives { - if strings.Contains(e, "監査役の監査の範囲を会計に関するものに限定") { - continue - } - - v, err := GetHoujinExecutiveValue(e) - if err != nil { - return err - } - h.HoujinExecutive[i] = v + if !h.processHoujinExecutive(s) { + return fmt.Errorf("failed to process houjin executive") } return nil } diff --git a/internal/toukibo/stock_converter.go b/internal/toukibo/stock_converter.go index 20fefa7..c22fcfb 100644 --- a/internal/toukibo/stock_converter.go +++ b/internal/toukibo/stock_converter.go @@ -7,38 +7,28 @@ import ( ) func GetStockNumber(s string) (int, string) { - sums := 0 - cur := 0 - foundKabu := false - for idx, v := range s { - if foundKabu { - return sums, s[idx:] - } - if v >= '0' && v <= '9' { - cur = cur*10 + int(v-'0') - continue - } - - switch v { - case '万': - sums += cur * 10000 - cur = 0 - case '億': - sums += cur * 100000000 - cur = 0 - case '兆': - sums += cur * 1000000000000 - cur = 0 - case '株': - sums += cur - cur = 0 // 意味はない - foundKabu = true - // 発行済株式の総数4万8249株各種の株式の数普通株式   3万249株A種優先株式 1万株B種優先株式 8000株 - // のようなパターンでは最初の株でReturnさせる - continue + // 株の位置を探す + kabuIndex := strings.Index(s, "株") + if kabuIndex == -1 { + // 株が見つからない場合 + num, err := ParseJapaneseNumber(s, JapaneseNumberUnits) + if err != nil { + return 0, "" } + return num, "" + } + + // 株までの部分を抽出 + numPart := s[:kabuIndex] + remaining := s[kabuIndex+len("株"):] + + // 数値をパース + num, err := ParseJapaneseNumber(numPart, JapaneseNumberUnits) + if err != nil { + return 0, remaining } - return sums, "" + + return num, remaining } func GetHoujinStock(stock string) HoujinStock { diff --git a/internal/toukibo/yen_converter.go b/internal/toukibo/yen_converter.go index 1d852dd..f491bbd 100644 --- a/internal/toukibo/yen_converter.go +++ b/internal/toukibo/yen_converter.go @@ -7,33 +7,11 @@ package toukibo // 上限は兆円まで func YenToNumber(yen string) int { - yen = ZenkakuToHankaku(yen) - - sums := 0 - cur := 0 - for _, v := range yen { - if v == '金' { - continue - 
} - if v >= '0' && v <= '9' { - cur = cur*10 + int(v-'0') - continue - } - - switch v { - case '万': - sums += cur * 10000 - cur = 0 - case '億': - sums += cur * 100000000 - cur = 0 - case '兆': - sums += cur * 1000000000000 - cur = 0 - case '円': - sums += cur - cur = 0 - } + // 新しい共通関数を使用 + result, err := ParseJapaneseNumberWithSuffix(yen, JapaneseNumberUnits, "円") + if err != nil { + // エラーの場合は0を返す(既存の動作を維持) + return 0 } - return sums + return result } diff --git a/toukibo_parser.go b/toukibo_parser.go index 08d8ad2..bbb52c8 100644 --- a/toukibo_parser.go +++ b/toukibo_parser.go @@ -2,6 +2,7 @@ package toukibo_parser import ( "bytes" + "fmt" "github.com/tychy/toukibo-parser/internal/pdf" "github.com/tychy/toukibo-parser/internal/toukibo" @@ -10,7 +11,7 @@ import ( func GetContentByPDFPath(path string) (string, error) { r, err := pdf.Open(path) if err != nil { - return "s", err + return "", err } var buf bytes.Buffer b, err := r.GetPlainText() @@ -21,7 +22,44 @@ func GetContentByPDFPath(path string) (string, error) { if err != nil { return "", err } - return buf.String(), nil + + content := buf.String() + // Check if content is valid (not empty and contains expected characters) + if len(content) == 0 { + return "", fmt.Errorf("PDF content is empty") + } + + // Check if content contains valid Japanese text or expected patterns + // Encrypted PDFs often produce garbled text + validChars := 0 + for _, r := range content { + if r >= 0x3040 && r <= 0x309F || // Hiragana + r >= 0x30A0 && r <= 0x30FF || // Katakana + r >= 0x4E00 && r <= 0x9FAF || // Kanji + r >= 0x20 && r <= 0x7E { // ASCII + validChars++ + } + } + + // If less than 50% of characters are valid, likely encrypted or corrupted + // Also check for specific patterns that indicate valid toukibo content + validRatio := float64(validChars) / float64(len([]rune(content))) + if validRatio < 0.5 { + // Check for common toukibo patterns + hasToukiboPattern := false + patterns := []string{"商 号", "本 店", "会社成立", "登記記録", 
"法人番号"} + for _, pattern := range patterns { + if bytes.Contains([]byte(content), []byte(pattern)) { + hasToukiboPattern = true + break + } + } + if !hasToukiboPattern { + return "", fmt.Errorf("PDF content appears to be encrypted or corrupted (valid ratio: %.2f)", validRatio) + } + } + + return content, nil } func ParseByPDFPath(path string) (*toukibo.Houjin, error) { diff --git a/toukibo_parser_test.go b/toukibo_parser_test.go index eaafb9e..23c7756 100644 --- a/toukibo_parser_test.go +++ b/toukibo_parser_test.go @@ -182,9 +182,18 @@ func TestToukiboParser(t *testing.T) { func TestBrokenToukibo(t *testing.T) { for i := 1; i <= 3; i++ { pdfFileName := fmt.Sprintf("testdata/broken/broken%d.pdf", i) - _, err := GetContentByPDFPath(pdfFileName) + content, err := GetContentByPDFPath(pdfFileName) + if err != nil { + t.Logf("broken%d.pdf: got error from GetContentByPDFPath: %v", i, err) + continue + } + + // If GetContentByPDFPath didn't return error, try parsing + _, err = toukibo.Parse(content) if err == nil { - t.Fatal("should be error") + t.Errorf("broken%d.pdf: expected error but got none, content length: %d", i, len(content)) + } else { + t.Logf("broken%d.pdf: got expected error from Parse: %v", i, err) } } }