interscript · hmdne · May 1, 2024 · May 1, 2024
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,5 @@
+/Gemfile.lock
+/sources
+/output
+_*
+
diff --git a/Gemfile.lock b/Gemfile.lock
diff --git a/errors_documentation.md b/errors_documentation.md
@@ -0,0 +1,68 @@
+# Errors file documentation
+
+The errors file is a TSV file generated by GeoTest.
+
+Errors are grouped by ID. If a particular stage has found an error with multiple records, all are grouped by the `error_id`.
+
+**Interscript bugs are not reported by default**. Please run this script with `--bugs` to also return Interscript bugs in the report.
+
+# `error_type`
+
+## Pre-analysis errors
+
+### uni_duplicate
+In the analyzed file, a record is supplied more than once. UNI is expected to be a unique key, yet in the analyzed file more than one
+record contains that key.
+
+## Clustering errors
+During the clustering phase, we attempt to generate a cluster of entries based on their `name_link` attribute. In theory, this should create
+a 1-1 connection between a script (non-transliterated) entry and a transliterated entry. In practice, sometimes a cluster contains more than
+2 entries - we tolerate that as long as there is exactly 1 script (non-transliterated) entry in such a cluster. Otherwise we report one
+of the following errors:
+
+### length
+An entry is expected to be part of a cluster, yet its cluster contains fewer than 2 records.
+
+### no_script
+A cluster contains no entry pertaining to the script (original; non-transliterated) entry.
+
+### no_transl
+None of the cluster entries contain information about transliteration. **This will make us attempt to find the matching transliteration system.**
+
+### too_much_script
+A cluster contains more than one script (original; non-transliterated) entry.
+
+### no_map
+**This is an Interscript bug**. This entry means that there is no Interscript support for a given transliteration map.
+
+## Testing errors
+During the testing phase, we attempt to use the transliteration info (`transl_cd`) to transliterate the original name, then compare both records.
+
+Since we are able to transliterate, a field `attempted_transliteration` will contain our attempted automated transliteration.
+
+All of those errors are reported; the individual error types are described below.
+
+### casing
+The entries match except for their casing.
+
+### punctuation
+The entries match except for their punctuation.
+
+### casing_and_punctuation
+The entries match except for their casing AND punctuation.
+
+### spacing_or_punctuation
+The entries match except for their spacing OR punctuation.
+
+### casing_and_spacing_or_punctuation
+The entries match except for their casing AND (spacing OR punctuation).
+
+### transliteration
+The entries **don't** match. **This will make us attempt to find the matching transliteration system.**
+
+# Finding the transliteration system
+If the error type is `no_transl` or `transliteration`, we attempt to find the closest transliteration system. If at least one map
+matches exactly (i.e. the Levenshtein distance between its output and the recorded name is 0), the field `other_matching_maps` will contain a
+comma-separated list of the maps supported by Interscript that, when used, produce a 100% correct transliteration.
+
+Note that when `lang_cd` is supplied, only maps pertaining to that language are tried, to save analysis time.
diff --git a/extract_source.sh b/extract_source.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Provide an absolute path to location of All_Countries.7z
+# Get it from: https://geonames.nga.mil/geonames/GNSData/
+
+rm -rf sources
+mkdir -p sources
+pushd sources
+7z x "$1"
+for i in *.zip; do
+  COUNTRY="$(echo $i | sed s/.zip//g)"
+  mkdir -p "$COUNTRY"
+  pushd "$COUNTRY"
+  unzip "../$i"
+  popd
+  rm -f "$i"
+done
diff --git a/test.rb b/test.rb
@@ -1,11 +1,16 @@
 require "interscript"
 require "interscript/compiler/ruby"
 require "optparse"
+require "csv"
 
 class Interscript::GeoTest
-  def initialize(file, verbose: false)
+  def initialize(file, verbose: false, report_bugs: false, error_file: nil)
     @file = file
     @verbose = verbose
+    @errors = []
+    @report_bugs = report_bugs
+    @error_file = error_file
+    @last_id = 0
   end
 
   def start
@@ -21,6 +26,8 @@ def start
     analyze_translit_systems
     analyze_usability_of_related_clusters
     analyze_good_clusters
+    # --- output ---
+    output_found_errors
   end
 
   def self.start(...) = new(...).start
@@ -68,8 +75,11 @@ def cluster_by_related
   attr_reader :records, :records_by_ufi, :records_by_uni, :records_by_transl, :related_clusters, :unique_related_clusters
 
   def analyze_uni_uniqueness
-    count = @records_by_uni.values.select { |i| i.length > 1 }.count
-    puts "#{count} records have a non-unique UNI (should be 0)"
+    duplicates = @records_by_uni.values.select { |i| i.length > 1 }
+    puts "#{duplicates.count} records have a non-unique UNI (should be 0)"
+    duplicates.each do |name|
+      add_error :uni_duplicate, name
+    end
     puts
   end
 
@@ -94,18 +104,23 @@ def analyze_usability_of_related_clusters
     @unique_related_clusters.each do |cluster|
       if cluster.length < 2
         # A bug - likely due to wrong data
+        add_error :length, cluster
         errors[:length] << cluster
-      elsif cluster.none? { |i| %w[NS DS VS].include? i.nt }
+      elsif cluster.none?(&:script?)
         # We can do nothing about it
+        add_error :no_script, cluster
         errors[:no_script] << cluster
       elsif cluster.none? { |i| i.transl_cd != '' }
         # TODO: Add some heuristics per run?
+        add_error :no_transl, cluster
         errors[:no_transl] << cluster
-      elsif cluster.count { |i| %w[NS DS VS].include? i.nt } > 1
+      elsif cluster.count(&:script?) > 1
         # TODO: split those by some heuristic like by LC
+        add_error :too_much_script, cluster
         errors[:too_much_script] << cluster
       elsif cluster.none? { |i| geo_to_is i.transl_cd }
         # We don't have a usable map for those entries
+        add_error :no_map, cluster
         errors[:no_map] << cluster
       else
         good << cluster
@@ -124,32 +139,38 @@ def analyze_usability_of_related_clusters
     @good_clusters = good
   end
 
-  def compare_and_return_error(first, second)
+  def compare_and_return_error(first, second, group)
     if first == second
       nil
     elsif first.downcase == second.downcase
+      add_error :casing, group, attempted_transliteration: first
       "Incorrect casing"
     elsif first.gsub(/[^[:alpha:][:space:]]/,'') == second.gsub(/[^[:alpha:][:space:]]/,'')
+      add_error :punctuation, group, attempted_transliteration: first
       "Incorrect punctuation"
     elsif first.downcase.gsub(/[^[:alpha:][:space:]]/,'') == second.downcase.gsub(/[^[:alpha:][:space:]]/,'')
+      add_error :casing_and_punctuation, group, attempted_transliteration: first
       "Incorrect casing and punctuation"
     elsif first.gsub(/[^[:alpha:]]/,'') == second.gsub(/[^[:alpha:]]/,'')
+      add_error :spacing_or_punctuation, group, attempted_transliteration: first
       "Incorrect spacing or punctuation"
     elsif first.downcase.gsub(/[^[:alpha:]]/,'') == second.downcase.gsub(/[^[:alpha:]]/,'')
+      add_error :casing_and_spacing_or_punctuation, group, attempted_transliteration: first
       "Incorrect casing and (spacing or punctuation)"
     else
+      add_error :transliteration, group, attempted_transliteration: first
       "Incorrect transliteration"
     end
   end
 
   def analyze_good_clusters
     results = {}
-    maps = {}
+    $maps ||= {}
 
     @good_clusters.each do |cluster|
       cluster = cluster.dup
 
-      original = cluster.find { |i| %w[NS DS VS].include? i.nt }
+      original = cluster.find(&:script?)
       cluster.delete(original)
 
       # The rest of entries in the cluster are transliterated entries
@@ -162,14 +183,11 @@ def analyze_good_clusters
           results[transl] << {error: "No support in Interscript", group: group}
           next
         end
-        compiler = Interscript.load(map_id, maps, compiler: Interscript::Compiler::Ruby)
-        result_fnro = compiler.(original.full_name_ro)
-        result_fnrg = compiler.(original.full_name_rg)
-
-        if error = compare_and_return_error(result_fnro, i.full_name_ro)
-          results[transl] << {error: error, group: group, result: [result_fnro, result_fnrg]}
-        elsif error = compare_and_return_error(result_fnrg, i.full_name_rg)
-          results[transl] << {error: error, group: group, result: [result_fnro, result_fnrg]}
+        compiler = Interscript.load(map_id, $maps, compiler: Interscript::Compiler::Ruby)
+        result_fn = compiler.(original.full_name)
+
+        if error = compare_and_return_error(result_fn, i.full_name, group)
+          results[transl] << {error: error, group: group, result: result_fn}
         else
           results[transl] << {ok: true, group: group}
         end
@@ -215,11 +233,37 @@ def analyze_translit_systems
     puts
   end
 
+  def output_found_errors
+    if @error_file
+      errors = @errors.map(&:to_h)
+
+      CSV.open(@error_file, "wb", col_sep: "\t") do |csv|
+        csv << Error::KEYS
+        errors.each do |hash|
+          csv << hash.values
+        end
+      end
+    end
+  end
+
+  def add_error(type, names, **kwargs)
+    # Skip reporting Interscript bugs by default
+    return if !@report_bugs && %i[no_map].include?(type)
+
+    names = Array(names)
+    @last_id += 1
+    names.each do |name|
+      @errors << Error.new(@last_id, type, name, **kwargs)
+    end
+  end
+
   class Name
-    FIELDS=%i[ufi uni mgrs nt lc full_name_ro full_name_rg name_link transl_cd]
+    FIELDS=%i[ufi uni mgrs nt lang_cd full_name name_link transl_cd script_cd]
     INT_FIELDS=%i[ufi uni name_link]
     attr_accessor *FIELDS
 
+    alias lc lang_cd
+
     def initialize(geotest, **kwargs)
       @geotest = geotest
       kwargs.each do |k,v|
@@ -242,15 +286,87 @@ def related
     def related_cluster
       @geotest.related_clusters[uni] || []
     end
+
+    def script?
+      %w[NS DS VS].include? nt
+    end
+  end
+
+  class Error
+    KEYS=%i[error_id error_type ufi uni nt full_name lang_cd transl_cd script_cd
+            attempted_transliteration other_matching_maps]
+
+    def initialize(id, type, name, attempted_transliteration: nil)
+      @id, @type, @name = id, type, name
+      @attempted_transliteration = attempted_transliteration
+    end
+    attr_reader :id, :type, :name, :attempted_transliteration, :other_matching_maps
+
+    def determine_other_matching_maps
+      return if name.script?
+
+      script_name = name.related_cluster.find(&:script?).full_name
+      transliterated_name = name.full_name
+
+      if name.lang_cd == ""
+        $stderr.puts "* Warning: a record with UFI #{name.ufi} has no lang_cd. Trying all maps - may take some time."
+      end
+
+      result = Interscript.detect(
+        script_name,
+        transliterated_name,
+        compiler: Interscript::Compiler::Ruby,
+        cache: $cache,
+        multiple: true,
+        map_pattern: name.lang_cd != "" ? "*-#{name.lang_cd}-*" : "*"
+      )
+      result = result.select { |_,v| v == 0 }.to_h.keys
+      result = result.join(", ")
+      @other_matching_maps = result
+    end
+
+    def to_h
+      if %i[no_transl transliteration].include? type
+        determine_other_matching_maps
+      end
+
+      {error_id: id,
+       error_type: type,
+
+       ufi: name.ufi,
+       uni: name.uni,
+       nt: name.nt,
+       full_name: name.full_name,
+       lang_cd: name.lang_cd,
+       transl_cd: name.transl_cd,
+       script_cd: name.script_cd,
+
+       attempted_transliteration: attempted_transliteration,
+       other_matching_maps: other_matching_maps
+      }
+    end
   end
 end
 
 options = {}
 OptionParser.new do |opts|
   opts.banner = "Usage: #{$0} [options] file"
 
-  opts.on("-v", "--verbose", "Describe all failures") do
-    options[:verbose] = true
+  # This function is obsolete. Please use the error file facility.
+  # opts.on("-v", "--verbose", "Describe all failures") do
+  #   options[:verbose] = true
+  # end
+
+  opts.on("-b", "--bugs", "Report interscript bugs in error file") do
+    options[:report_bugs] = true
+  end
+
+  opts.on("-o", "--output=FILE", "Output the analysis summary to FILE") do |file|
+    $stdout = File.open(file, 'w')
+  end
+
+  opts.on("-e", "--error-file=FILE", "Generate a TSV error file, containing all found errors") do |file|
+    options[:error_file] = file
   end
 
   opts.on("-h", "--help", "Prints this help") do

diff --git a/test_all.sh b/test_all.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Run test_single.sh once for every directory found directly under sources/,
+# using as many parallel jobs as nproc reports.
+JOBS="$(nproc)"
+find sources -mindepth 1 -maxdepth 1 -type d -print0 \
+  | cut -z -d/ -f2 \
+  | xargs -0 -P"$JOBS" -n1 ./test_single.sh
diff --git a/test_single.sh b/test_single.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+COUNTRY="$1"
+
+mkdir -p "output/$COUNTRY"
+echo "* Started: $COUNTRY"
+bundle exec ruby test.rb --output="output/$COUNTRY/result.txt" --error-file="output/$COUNTRY/errors.tsv" "sources/$COUNTRY/$COUNTRY.txt"
+echo "* Finished: $COUNTRY"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#!/bin/bash
		find sources -mindepth 1 -maxdepth 1 -type d -print0 \| cut -z -d/ -f2 \| xargs -0 -P`nproc` -n1 ./test_single.sh