Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion data/testcorpus.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
M-1206 . cama(.)a
M-403 . a(.)tara
023-as-spider:
M-955 j(..)a
M-955 . j(..)a
. . r(..)na
. . (..)na
L-106 23 (..)a
Expand Down
290 changes: 262 additions & 28 deletions scripts/prove.pl
Original file line number Diff line number Diff line change
@@ -1,46 +1,161 @@
#!/usr/bin/perl -l
use List::Util 'first';
use Class::Struct;
#!/usr/bin/env perl

open(my $fh, '<', "mw.txt")
or die("Can't open mw.txt: $!\n");
# Enable important safety and quality of life features
use strict; # Requires all variables to be declared
use warnings; # Enables helpful warning messages
use utf8; # Enable UTF-8 in source code
use open qw(:std :utf8); # Enable UTF-8 for input/output
use feature 'say'; # Enables 'say' which is like print but adds newline automatically

my @dict = <$fh>;
chomp @dict;
close $fh;
# Import required modules
use List::Util 'first'; # Provides useful list manipulation functions
use Getopt::Long; # For command line argument processing
use Pod::Usage; # For documentation handling
use Data::Dumper; # Helpful for debugging - prints data structures

$which = $ARGV[0];
=head1 NAME

corpus_processor.pl - Process textual corpus data with named pattern matching

=head1 SYNOPSIS

./corpus_processor.pl [options] [target_character]

Options:
--dict-file=FILE Dictionary file path (default: mw.txt)
--corpus-file=FILE Corpus file path (default: ./data/testcorpus.txt)
--help Show this help message
--man Show full documentation

Examples:
./corpus_processor.pl # Use default files and Process all characters
./corpus_processor.pl A # Only process entries starting with 'A'
./corpus_processor.pl --dict-file=custom.txt # Use custom dictionary
./corpus_processor.pl --corpus-file=new.txt # Use custom corpus file
./corpus_processor.pl --help # Show help
./corpus_processor.pl --man # Show full manual


=head1 DESCRIPTION

This script reads inscription data from a corpus file and matches patterns against
a dictionary. It supports regex patterns and can filter processing by character.

=cut

# Global variables
my %VALUES = ();
my $target_char; # Optional filter to only process entries starting with specific character
my $dict_file; # Dictionary file path
my $corpus_file; # Corpus file path

# Process command line arguments
sub process_args {
# Set defaults
$dict_file = "./mw.txt";
$corpus_file = "./data/testcorpus.txt";

# Get options
GetOptions(
"dict-file=s" => \$dict_file,
"corpus-file=s" => \$corpus_file,
"help" => sub { pod2usage(1) },
"man" => sub { pod2usage(-exitstatus => 0, -verbose => 2) }
) or pod2usage(2);

# Get optional target character
$target_char = shift @ARGV if @ARGV;
}

# Main execution block
{
process_args();
my $dictionary = load_dictionary($dict_file);
my $readtexts = &read_corpus($corpus_file);
my @texts = @$readtexts;

for my $text (@texts) {
&process_text($text, $dictionary, $target_char);
}

# Print results in sorted order
print_results();
}


=head2 load_dictionary

my $readtexts = &readtexts();
my @texts = @$readtexts;
Loads the dictionary file containing words to match against.

for my $text (@texts) {
&resolve($text);
Parameters:
$dict_filename - Path to the dictionary file

Returns:
Array reference containing dictionary words

Dies with error message if file cannot be opened.

=cut

sub load_dictionary {
my ($dict_filename) = @_;
open(my $fh, '<', $dict_filename) or die "Cannot open $dict_filename: $!\n";
my @dict = <$fh>;
chomp(@dict);
close $fh;
return \@dict;
}

for(sort keys %VALUES) {
print "$_ = @{$VALUES{$_}}"

=head2 print_results

Prints the final results in sorted order by character.

Format:
character = option1 option2 option3 ...

=cut

sub print_results {
for my $char (sort keys %VALUES) {
say "$char = @{$VALUES{$char}}";
}
}

sub resolve {
my ($textset) = @_;
$char = shift @$textset;
if($which) { next unless($which && $char =~/^$which/);}
=head2 process_text

Processes a single text entry, matching patterns and storing results.

Parameters:
$text_set - Reference to array containing glyph data
$dictionary - Reference to array of dictionary words
target_char - Process only those entries starting with specific character

The function matches patterns against dictionary words and stores
results in the global %values hash.

=cut


sub process_text {
my ($textset, $dictionary, $target_char) = @_;
my $char = shift @$textset;
if($target_char) { next unless($target_char && $char =~/^$target_char/);}
my @options = ();
for my $lines (@$textset) {
for my $text (@$lines) {
my ($line) = @$text;
my ($cisi, $inscr, $re) = @$text;
print "$char $cisi INSCR $inscr REGEX $re";
say "$char $cisi INSCR $inscr REGEX $re";

if(scalar @options > 0) {
my $optset = "(". join("|", @options) . ")";
$re =~ s/\(\..*?\)/$optset/;
}
my @matches = grep { /^$re$/ } @dict;
print "$inscr $re matches @matches[0..9]";
my @matches = grep { /^$re$/ } @$dictionary;
# Get up to 10 matches safely
my @display_matches = $#matches > 9 ? @matches[0..9] : @matches;
say "$inscr $re matches @display_matches" . (" " x (9 - $#display_matches)); # Padding to match original ;
@options = ();
for(@matches) {
if (/^$re$/) { push @options, "$1" }
Expand All @@ -62,9 +177,30 @@ sub resolve {
$VALUES{$char} = \@options;
}

sub readtexts {
print "Reading corpus...";
open(my $fh, '<', "./data/testcorpus.txt") || die "Cannot open data/testcorpus.txt";

=head2 read_corpus

Reads and parses the corpus file containing inscription data.

Parameters:
$corpus_filename - Path to the corpus file

Returns:
Array reference containing parsed corpus data in the format:
[
[glyph_name, [[cisi, inscription, regex], ...]],
[glyph_name, [[cisi, inscription, regex], ...]],
...
]

Dies with error message if file cannot be opened.

=cut

sub read_corpus {
my ($corpus_file) = @_;
say "Reading corpus...";
open(my $fh, '<', $corpus_file) || die "Cannot open data/testcorpus.txt";
my @corpus = <$fh>;
close $fh;
chomp(@corpus);
Expand All @@ -83,13 +219,111 @@ sub readtexts {
}
else {
s/^\s+|\s+$//g;
# Debug the line before split
# say "DEBUG: Processing line: '$_'\n";
my ($cisi, $inscr, $regex) = split(/\s+/, $_);

# Validate we got all three fields
if (!defined $cisi || !defined $inscr || !defined $regex) {
say "WARNING: Invalid line format: '$_'\n";
say " Got: cisi='", ($cisi//'<undef>'),
"' inscr='", ($inscr//'<undef>'),
"' regex='", ($regex//'<undef>'), "'\n";
next; # Skip this line
}

push @$glyphs, [$cisi, $inscr, $regex];
print "CISI $cisi INSCR $inscr REGEX $regex";
say "CISI $cisi INSCR $inscr REGEX $regex";
# s/^\s+|\s+$//g;
# my ($cisi, $inscr, $regex) = split(/\s+/, $_);
# push @$glyphs, [$cisi, $inscr, $regex];
# say "CISI $cisi INSCR $inscr REGEX $regex";
}
}
push(@$texts, [$glyph, $glyphs]) if $glyph;
print "Reading corpus...done";
say "Reading corpus...done";

return $texts;
}
}


=head1 DESCRIPTION

Enhanced version of the corpus processing script that adds:
- Documentation
- Command line argument handling
- Flexible file path configuration

=head1 AUTHOR

Original script by [@yajnadevam]

=head1 CONTRIBUTORS

Enhanced version with docs, comments and CLI by [@performance]

=head1 VERSION HISTORY

=over 4

=item v1.0 (Original)

* Initial implementation

=item v2.0 (Current)

* Made file paths and target char configurable via command-line args
* Added documentation
* Improved error handling
* Better code structure
* Added UTF-8 support
* Added a few debug logs

=back

=head1 LICENSE

Same license as the original.

=head1 CREDITS

Based on the original corpus processing script developed by @yajnadevam.

=cut

__END__

# Additional documentation:

=head1 FILE FORMATS

=head2 Dictionary File (mw.txt)

One word per line, UTF-8 encoded.

Example:
word1
word2
word3

=head2 Corpus File (testcorpus.txt)

Format:
CharacterName:
cisi inscription regex
cisi inscription regex
# comments are allowed

NextCharacter:
cisi inscription regex
...

=head1 DEBUGGING

To enable detailed debugging, uncomment the following line near the top of the script:
# use Data::Dumper;

Then add debug statements like:
print Dumper($variable);

=cut
Loading