yajnadevam · performance · Nov 11, 2024 · Nov 11, 2024
diff --git a/data/testcorpus.txt b/data/testcorpus.txt
@@ -78,7 +78,7 @@
     M-1206   .   cama(.)a
     M-403   .   a(.)tara
 023-as-spider:
-    M-955   j(..)a
+    M-955  .  j(..)a
     .   .   r(..)na
     .   .   (..)na
     L-106   23   (..)a

diff --git a/scripts/prove.pl b/scripts/prove.pl
@@ -1,46 +1,161 @@
-#!/usr/bin/perl -l
-use List::Util 'first';
-use Class::Struct;
+#!/usr/bin/env perl
 
-open(my $fh, '<', "mw.txt")
-   or die("Can't open mw.txt: $!\n");
+# Enable important safety and quality of life features
+use strict;      # Requires all variables to be declared
+use warnings;    # Enables helpful warning messages
+use utf8;        # Enable UTF-8 in source code
+use open qw(:std :utf8);  # Enable UTF-8 for input/output
+use feature 'say';  # Enables 'say' which is like print but adds newline automatically
 
-my @dict = <$fh>;
-chomp @dict;
-close $fh;
+# Import required modules
+use List::Util 'first'; # Provides useful list manipulation functions 
+use Getopt::Long;    # For command line argument processing
+use Pod::Usage;      # For documentation handling
+use Data::Dumper;  # Helpful for debugging - prints data structures
 
-$which = $ARGV[0];
+=head1 NAME
 
+ corpus_processor.pl - Process textual corpus data with named pattern matching
+
+=head1 SYNOPSIS
+
+    ./corpus_processor.pl [options] [target_character]
+
+    Options:
+        --dict-file=FILE    Dictionary file path (default: mw.txt)
+        --corpus-file=FILE  Corpus file path (default: ./data/testcorpus.txt)
+        --help             Show this help message
+        --man              Show full documentation
+
+    Examples:
+        ./corpus_processor.pl                    # Use default files and Process all characters
+        ./corpus_processor.pl A                  # Only process entries starting with 'A'
+        ./corpus_processor.pl --dict-file=custom.txt   # Use custom dictionary
+        ./corpus_processor.pl --corpus-file=new.txt    # Use custom corpus file
+        ./corpus_processor.pl --help             # Show help
+        ./corpus_processor.pl --man              # Show full manual
+
+
+=head1 DESCRIPTION
+
+    This script reads inscription data from a corpus file and matches patterns against 
+    a dictionary. It supports regex patterns and can filter processing by character.
+
+=cut
+
+# Global variables
 my %VALUES = ();
+my $target_char;    # Optional filter to only process entries starting with specific character
+my $dict_file;      # Dictionary file path
+my $corpus_file;    # Corpus file path
+
+# Process command line arguments
+sub process_args {
+    # Set defaults
+    $dict_file = "./mw.txt";
+    $corpus_file = "./data/testcorpus.txt";
+
+    # Get options
+    GetOptions(
+        "dict-file=s"   => \$dict_file,
+        "corpus-file=s" => \$corpus_file,
+        "help"          => sub { pod2usage(1) },
+        "man"           => sub { pod2usage(-exitstatus => 0, -verbose => 2) }
+    ) or pod2usage(2);
+
+    # Get optional target character
+    $target_char = shift @ARGV if @ARGV;
+}
+
+# Main execution block
+{
+    process_args();
+    my $dictionary = load_dictionary($dict_file);
+    my $readtexts = &read_corpus($corpus_file);
+    my @texts = @$readtexts;
+
+    for my $text (@texts) {
+        &process_text($text, $dictionary, $target_char);
+    }
+
+    # Print results in sorted order
+    print_results();
+}
+
+
+=head2 load_dictionary
 
-my $readtexts = &readtexts();
-my @texts = @$readtexts;
+Loads the dictionary file containing words to match against.
 
-for my $text (@texts) {
-    &resolve($text);
+Parameters:
+    $dict_filename - Path to the dictionary file
+
+Returns:
+    Array reference containing dictionary words
+
+Dies with error message if file cannot be opened.
+
+=cut
+
+sub load_dictionary {
+    my ($dict_filename) = @_;
+    open(my $fh, '<', $dict_filename) or die "Cannot open $dict_filename: $!\n";
+    my @dict = <$fh>;
+    chomp(@dict);
+    close $fh;
+    return \@dict;
 }
 
-for(sort keys %VALUES) {
-    print "$_ = @{$VALUES{$_}}"
+
+=head2 print_results
+
+Prints the final results in sorted order by character.
+
+Format:
+    character = option1 option2 option3 ...
+
+=cut
+
+sub print_results {
+    for my $char (sort keys %VALUES) {
+        say "$char = @{$VALUES{$char}}";
+    }
 }
 
-sub resolve {
-    my ($textset) = @_;
-    $char = shift @$textset;
-    if($which) { next unless($which && $char =~/^$which/);}
+=head2 process_text
+
+Processes a single text entry, matching patterns and storing results.
+
+Parameters:
+    $text_set   - Reference to array containing glyph data
+    $dictionary - Reference to array of dictionary words
+    target_char - Process only those entries starting with specific character
+
+The function matches patterns against dictionary words and stores
+results in the global %values hash.
+
+=cut
+
+
+sub process_text {
+    my ($textset, $dictionary, $target_char) = @_;
+    my $char = shift @$textset;
+    if($target_char) { next unless($target_char && $char =~/^$target_char/);}
     my @options = ();
     for my $lines (@$textset) {
       for my $text (@$lines) {
         my ($line) = @$text; 
         my ($cisi, $inscr, $re) = @$text;
-        print "$char $cisi INSCR $inscr REGEX $re";
+        say "$char $cisi INSCR $inscr REGEX $re";
 
         if(scalar @options > 0) {
             my $optset = "(". join("|", @options) . ")";
             $re =~ s/\(\..*?\)/$optset/;
         }
-        my @matches = grep { /^$re$/ } @dict;
-        print "$inscr $re matches @matches[0..9]";
+        my @matches = grep { /^$re$/ } @$dictionary;
+        # Get up to 10 matches safely
+        my @display_matches = $#matches > 9 ? @matches[0..9] : @matches;
+        say "$inscr $re matches @display_matches" . (" " x (9 - $#display_matches));  # Padding to match original ;
         @options = ();
         for(@matches) {
             if (/^$re$/) { push @options, "$1" }
@@ -62,9 +177,30 @@ sub resolve {
     $VALUES{$char} = \@options;
 }
 
-sub readtexts {
-    print "Reading corpus...";
-    open(my $fh, '<', "./data/testcorpus.txt") || die "Cannot open data/testcorpus.txt";
+
+=head2 read_corpus
+
+Reads and parses the corpus file containing inscription data.
+
+Parameters:
+    $corpus_filename - Path to the corpus file
+
+Returns:
+    Array reference containing parsed corpus data in the format:
+    [
+        [glyph_name, [[cisi, inscription, regex], ...]], 
+        [glyph_name, [[cisi, inscription, regex], ...]], 
+        ...
+    ]
+
+Dies with error message if file cannot be opened.
+
+=cut
+
+sub read_corpus {
+    my ($corpus_file) = @_;
+    say "Reading corpus...";
+    open(my $fh, '<', $corpus_file) || die "Cannot open data/testcorpus.txt";
     my @corpus = <$fh>;
     close $fh;
     chomp(@corpus);
@@ -83,13 +219,111 @@ sub readtexts {
         }
         else {
             s/^\s+|\s+$//g;
+            # Debug the line before split
+            # say "DEBUG: Processing line: '$_'\n";
             my ($cisi, $inscr, $regex) = split(/\s+/, $_);
+
+            # Validate we got all three fields
+            if (!defined $cisi || !defined $inscr || !defined $regex) {
+                say "WARNING: Invalid line format: '$_'\n";
+                say "  Got: cisi='", ($cisi//'<undef>'), 
+                      "' inscr='", ($inscr//'<undef>'), 
+                      "' regex='", ($regex//'<undef>'), "'\n";
+                next;  # Skip this line
+            }
+
             push @$glyphs, [$cisi, $inscr, $regex];
-            print "CISI $cisi INSCR $inscr REGEX $regex";
+            say "CISI $cisi INSCR $inscr REGEX $regex";
+            # s/^\s+|\s+$//g;
+            # my ($cisi, $inscr, $regex) = split(/\s+/, $_);
+            # push @$glyphs, [$cisi, $inscr, $regex];
+            # say "CISI $cisi INSCR $inscr REGEX $regex";
         }
     }
     push(@$texts, [$glyph, $glyphs]) if $glyph;
-    print "Reading corpus...done";
+    say "Reading corpus...done";
 
     return $texts;
-}
+}
+
+
+=head1 DESCRIPTION
+
+Enhanced version of the corpus processing script that adds:
+- Documentation
+- Command line argument handling
+- Flexible file path configuration
+
+=head1 AUTHOR
+
+Original script by [@yajnadevam]
+
+=head1 CONTRIBUTORS
+
+Enhanced version with docs, comments and CLI  by [@performance]
+
+=head1 VERSION HISTORY
+
+=over 4
+
+=item v1.0 (Original)
+
+* Initial implementation
+
+=item v2.0 (Current)
+
+* Made file paths and target char configurable via command-line args
+* Added documentation
+* Improved error handling
+* Better code structure
+* Added UTF-8 support
+* Added a few debug logs
+
+=back
+
+=head1 LICENSE
+
+Same license as the original.
+
+=head1 CREDITS
+
+Based on the original corpus processing script developed by @yajnadevam.
+
+=cut
+
+__END__
+
+# Additional documentation:
+
+=head1 FILE FORMATS
+
+=head2 Dictionary File (mw.txt)
+
+One word per line, UTF-8 encoded.
+
+Example:
+    word1
+    word2
+    word3
+
+=head2 Corpus File (testcorpus.txt)
+
+Format:
+    CharacterName:
+    cisi inscription regex
+    cisi inscription regex
+    # comments are allowed
+
+    NextCharacter:
+    cisi inscription regex
+    ...
+
+=head1 DEBUGGING
+
+To enable detailed debugging, uncomment the following line near the top of the script:
+# use Data::Dumper;
+
+Then add debug statements like:
+print Dumper($variable);
+
+=cut