diff --git a/data/testcorpus.txt b/data/testcorpus.txt index 7864444..bec2adb 100644 --- a/data/testcorpus.txt +++ b/data/testcorpus.txt @@ -78,7 +78,7 @@ M-1206 . cama(.)a M-403 . a(.)tara 023-as-spider: - M-955 j(..)a + M-955 . j(..)a . . r(..)na . . (..)na L-106 23 (..)a diff --git a/scripts/prove.pl b/scripts/prove.pl index 1d7ec90..405bf71 100755 --- a/scripts/prove.pl +++ b/scripts/prove.pl @@ -1,46 +1,161 @@ -#!/usr/bin/perl -l -use List::Util 'first'; -use Class::Struct; +#!/usr/bin/env perl -open(my $fh, '<', "mw.txt") - or die("Can't open mw.txt: $!\n"); +# Enable important safety and quality of life features +use strict; # Requires all variables to be declared +use warnings; # Enables helpful warning messages +use utf8; # Enable UTF-8 in source code +use open qw(:std :utf8); # Enable UTF-8 for input/output +use feature 'say'; # Enables 'say' which is like print but adds newline automatically -my @dict = <$fh>; -chomp @dict; -close $fh; +# Import required modules +use List::Util 'first'; # Provides useful list manipulation functions +use Getopt::Long; # For command line argument processing +use Pod::Usage; # For documentation handling +use Data::Dumper; # Helpful for debugging - prints data structures -$which = $ARGV[0]; +=head1 NAME + corpus_processor.pl - Process textual corpus data with named pattern matching + +=head1 SYNOPSIS + + ./corpus_processor.pl [options] [target_character] + + Options: + --dict-file=FILE Dictionary file path (default: mw.txt) + --corpus-file=FILE Corpus file path (default: ./data/testcorpus.txt) + --help Show this help message + --man Show full documentation + + Examples: + ./corpus_processor.pl # Use default files and Process all characters + ./corpus_processor.pl A # Only process entries starting with 'A' + ./corpus_processor.pl --dict-file=custom.txt # Use custom dictionary + ./corpus_processor.pl --corpus-file=new.txt # Use custom corpus file + ./corpus_processor.pl --help # Show help + ./corpus_processor.pl --man # Show full manual + + +=head1 DESCRIPTION + + This script reads inscription data from a corpus file and matches patterns against + a dictionary. It supports regex patterns and can filter processing by character. + +=cut + +# Global variables my %VALUES = (); +my $target_char; # Optional filter to only process entries starting with specific character +my $dict_file; # Dictionary file path +my $corpus_file; # Corpus file path + +# Process command line arguments +sub process_args { + # Set defaults + $dict_file = "./mw.txt"; + $corpus_file = "./data/testcorpus.txt"; + + # Get options + GetOptions( + "dict-file=s" => \$dict_file, + "corpus-file=s" => \$corpus_file, + "help" => sub { pod2usage(1) }, + "man" => sub { pod2usage(-exitstatus => 0, -verbose => 2) } + ) or pod2usage(2); + + # Get optional target character + $target_char = shift @ARGV if @ARGV; +} + +# Main execution block +{ + process_args(); + my $dictionary = load_dictionary($dict_file); + my $readtexts = &read_corpus($corpus_file); + my @texts = @$readtexts; + + for my $text (@texts) { + &process_text($text, $dictionary, $target_char); + } + + # Print results in sorted order + print_results(); +} + + +=head2 load_dictionary -my $readtexts = &readtexts(); -my @texts = @$readtexts; +Loads the dictionary file containing words to match against. -for my $text (@texts) { - &resolve($text); +Parameters: + $dict_filename - Path to the dictionary file + +Returns: + Array reference containing dictionary words + +Dies with error message if file cannot be opened. + +=cut + +sub load_dictionary { + my ($dict_filename) = @_; + open(my $fh, '<', $dict_filename) or die "Cannot open $dict_filename: $!\n"; + my @dict = <$fh>; + chomp(@dict); + close $fh; + return \@dict; } -for(sort keys %VALUES) { - print "$_ = @{$VALUES{$_}}" + +=head2 print_results + +Prints the final results in sorted order by character. + +Format: + character = option1 option2 option3 ... + +=cut + +sub print_results { + for my $char (sort keys %VALUES) { + say "$char = @{$VALUES{$char}}"; + } } -sub resolve { - my ($textset) = @_; - $char = shift @$textset; - if($which) { next unless($which && $char =~/^$which/);} +=head2 process_text + +Processes a single text entry, matching patterns and storing results. + +Parameters: + $text_set - Reference to array containing glyph data + $dictionary - Reference to array of dictionary words + target_char - Process only those entries starting with specific character + +The function matches patterns against dictionary words and stores +results in the global %values hash. + +=cut + + +sub process_text { + my ($textset, $dictionary, $target_char) = @_; + my $char = shift @$textset; + if($target_char) { next unless($target_char && $char =~/^$target_char/);} my @options = (); for my $lines (@$textset) { for my $text (@$lines) { my ($line) = @$text; my ($cisi, $inscr, $re) = @$text; - print "$char $cisi INSCR $inscr REGEX $re"; + say "$char $cisi INSCR $inscr REGEX $re"; if(scalar @options > 0) { my $optset = "(". join("|", @options) . ")"; $re =~ s/\(\..*?\)/$optset/; } - my @matches = grep { /^$re$/ } @dict; - print "$inscr $re matches @matches[0..9]"; + my @matches = grep { /^$re$/ } @$dictionary; + # Get up to 10 matches safely + my @display_matches = $#matches > 9 ? @matches[0..9] : @matches; + say "$inscr $re matches @display_matches" . (" " x (9 - $#display_matches)); # Padding to match original ; @options = (); for(@matches) { if (/^$re$/) { push @options, "$1" } @@ -62,9 +177,30 @@ sub resolve { $VALUES{$char} = \@options; } -sub readtexts { - print "Reading corpus..."; - open(my $fh, '<', "./data/testcorpus.txt") || die "Cannot open data/testcorpus.txt"; + +=head2 read_corpus + +Reads and parses the corpus file containing inscription data. + +Parameters: + $corpus_filename - Path to the corpus file + +Returns: + Array reference containing parsed corpus data in the format: + [ + [glyph_name, [[cisi, inscription, regex], ...]], + [glyph_name, [[cisi, inscription, regex], ...]], + ... + ] + +Dies with error message if file cannot be opened. + +=cut + +sub read_corpus { + my ($corpus_file) = @_; + say "Reading corpus..."; + open(my $fh, '<', $corpus_file) || die "Cannot open data/testcorpus.txt"; my @corpus = <$fh>; close $fh; chomp(@corpus); @@ -83,13 +219,111 @@ sub readtexts { } else { s/^\s+|\s+$//g; + # Debug the line before split + # say "DEBUG: Processing line: '$_'\n"; my ($cisi, $inscr, $regex) = split(/\s+/, $_); + + # Validate we got all three fields + if (!defined $cisi || !defined $inscr || !defined $regex) { + say "WARNING: Invalid line format: '$_'\n"; + say " Got: cisi='", ($cisi//''), + "' inscr='", ($inscr//''), + "' regex='", ($regex//''), "'\n"; + next; # Skip this line + } + push @$glyphs, [$cisi, $inscr, $regex]; - print "CISI $cisi INSCR $inscr REGEX $regex"; + say "CISI $cisi INSCR $inscr REGEX $regex"; + # s/^\s+|\s+$//g; + # my ($cisi, $inscr, $regex) = split(/\s+/, $_); + # push @$glyphs, [$cisi, $inscr, $regex]; + # say "CISI $cisi INSCR $inscr REGEX $regex"; } } push(@$texts, [$glyph, $glyphs]) if $glyph; - print "Reading corpus...done"; + say "Reading corpus...done"; return $texts; -} \ No newline at end of file +} + + +=head1 DESCRIPTION + +Enhanced version of the corpus processing script that adds: +- Documentation +- Command line argument handling +- Flexible file path configuration + +=head1 AUTHOR + +Original script by [@yajnadevam] + +=head1 CONTRIBUTORS + +Enhanced version with docs, comments and CLI by [@performance] + +=head1 VERSION HISTORY + +=over 4 + +=item v1.0 (Original) + +* Initial implementation + +=item v2.0 (Current) + +* Made file paths and target char configurable via command-line args +* Added documentation +* Improved error handling +* Better code structure +* Added UTF-8 support +* Added a few debug logs + +=back + +=head1 LICENSE + +Same license as the original. + +=head1 CREDITS + +Based on the original corpus processing script developed by @yajnadevam. + +=cut + +__END__ + +# Additional documentation: + +=head1 FILE FORMATS + +=head2 Dictionary File (mw.txt) + +One word per line, UTF-8 encoded. + +Example: + word1 + word2 + word3 + +=head2 Corpus File (testcorpus.txt) + +Format: + CharacterName: + cisi inscription regex + cisi inscription regex + # comments are allowed + + NextCharacter: + cisi inscription regex + ... + +=head1 DEBUGGING + +To enable detailed debugging, uncomment the following line near the top of the script: +# use Data::Dumper; + +Then add debug statements like: +print Dumper($variable); + +=cut \ No newline at end of file diff --git a/scripts/test_harness.pl b/scripts/test_harness.pl new file mode 100644 index 0000000..805ce77 --- /dev/null +++ b/scripts/test_harness.pl @@ -0,0 +1,151 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use Test::More; +use File::Temp qw(tempdir); +use File::Path qw(make_path); +use Capture::Tiny qw(capture); +use File::Copy; +use Cwd; + +# Store original directory +my $original_dir = getcwd(); +my $test_dir = tempdir(CLEANUP => 1); + +sub setup_test_env { + my ($dict_file, $corpus_file) = @_; + make_path("$test_dir/data"); + + my $abs_dict = File::Spec->rel2abs($dict_file); + my $abs_corpus = File::Spec->rel2abs($corpus_file); + + copy($abs_dict, "$test_dir/mw.txt") + or die "Failed to copy dictionary file: $!"; + copy($abs_corpus, "$test_dir/data/testcorpus.txt") + or die "Failed to copy corpus file: $!"; +} + +# Function to count and analyze sections +sub analyze_output { + my ($output) = @_; + my @sections; + my $current_section = ''; + my $current_content = ''; + + for my $line (split /\n/, $output) { + if ($line =~ /^(\d{3}-[^:]+):/) { + if ($current_section) { + push @sections, [$current_section, $current_content]; + } + $current_section = $1; + $current_content = "$line\n"; + } elsif ($current_section) { + $current_content .= "$line\n"; + } + } + + # Add last section + if ($current_section) { + push @sections, [$current_section, $current_content]; + } + + return \@sections; +} + +# Function to run a script with given arguments +sub run_script { + my ($script, $args) = @_; + my $abs_script = File::Spec->rel2abs($script, $original_dir); + + chdir $test_dir or die "Cannot change to test directory: $!"; + + my ($stdout, $stderr, $exit) = capture { + system($^X, $abs_script, split(/\s+/, $args)); + }; + + chdir $original_dir or die "Cannot change back to original directory: $!"; + + return ($stdout, $stderr, $exit); +} + +# Main test execution +sub main { + my ($dict_file, $corpus_file) = @_; + + die "Dictionary file not found: $dict_file" unless -f $dict_file; + die "Corpus file not found: $corpus_file" unless -f $corpus_file; + + print "Setting up test environment...\n"; + setup_test_env($dict_file, $corpus_file); + + print "Running original script...\n"; + my ($orig_stdout, $orig_stderr, $orig_exit) = run_script('orig_prove.pl', ''); + + print "Running new script...\n"; + my ($new_stdout, $new_stderr, $new_exit) = run_script('prove.pl', ''); + + # Analyze outputs + my $orig_sections = analyze_output($orig_stdout); + my $new_sections = analyze_output($new_stdout); + + print "Original sections: " . scalar(@$orig_sections) . "\n"; + print "New sections: " . scalar(@$new_sections) . "\n\n"; + + # Compare sections + my $max_sections = scalar(@$orig_sections) > scalar(@$new_sections) ? + scalar(@$orig_sections) : scalar(@$new_sections); + + for my $i (0 .. $max_sections - 1) { + if ($i < @$orig_sections && $i < @$new_sections) { + my $orig_section = $orig_sections->[$i]; + my $new_section = $new_sections->[$i]; + + if ($orig_section->[0] ne $new_section->[0]) { + print "Section name mismatch at position $i:\n"; + print "Original: $orig_section->[0]\n"; + print "New: $new_section->[0]\n\n"; + } + + if ($orig_section->[1] ne $new_section->[1]) { + print "Content mismatch in section $orig_section->[0]:\n"; + print "Original:\n$orig_section->[1]\n"; + print "New:\n$new_section->[1]\n"; + print "-" x 80 . "\n"; + } + } else { + print "Missing section at position $i\n"; + if ($i < @$orig_sections) { + print "Original has: $orig_sections->[$i]->[0]\n"; + } + if ($i < @$new_sections) { + print "New has: $new_sections->[$i]->[0]\n"; + } + print "\n"; + } + } + + # Save outputs for inspection + open(my $orig_fh, '>', "original_output.txt"); + print $orig_fh $orig_stdout; + close($orig_fh); + + open(my $new_fh, '>', "new_output.txt"); + print $new_fh $new_stdout; + close($new_fh); + + # Test output + + is($new_stdout, $orig_stdout, "stderr matches"); + is($new_stderr, $orig_stderr, "stderr matches"); + is($new_exit, $orig_exit, "exit code matches"); + + done_testing(); +} + +# Check command line arguments +unless (@ARGV == 2) { + die "Usage: $0 dictionary_file corpus_file\n"; +} + +main(@ARGV); \ No newline at end of file