From 9e6d873553829e1006dae7c92e278b4263aac864 Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Wed, 1 May 2024 03:52:57 +0200 Subject: [PATCH 1/2] Update GeoTest to support new format. In particular, the following fields are not available in new format: - full_name_rg The following fields have been renamed: - full_name_ro -> full_name - lcd -> lang_cd --- Gemfile.lock | 19 ------------------- extract_source.sh | 16 ++++++++++++++++ test.rb | 13 ++++++------- 3 files changed, 22 insertions(+), 26 deletions(-) delete mode 100644 Gemfile.lock create mode 100755 extract_source.sh diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index b6ee2fe..0000000 --- a/Gemfile.lock +++ /dev/null @@ -1,19 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - interscript (2.4.5) - interscript-maps (~> 2.4.0a) - text - thor - interscript-maps (2.4.3) - text (1.3.1) - thor (1.3.0) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - interscript - -BUNDLED WITH - 2.4.21 diff --git a/extract_source.sh b/extract_source.sh new file mode 100755 index 0000000..6ce19de --- /dev/null +++ b/extract_source.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# Provide an absolute path to location of All_Countries.7z +# Get it from: https://geonames.nga.mil/geonames/GNSData/ + +rm -rf sources +mkdir -p sources +pushd sources +7z x "$1" +for i in *.zip; do + COUNTRY="$(echo $i | sed s/.zip//g)" + mkdir -p "$COUNTRY" + pushd "$COUNTRY" + unzip "../$i" + popd + rm -f "$i" +done diff --git a/test.rb b/test.rb index c8d00f2..b582291 100644 --- a/test.rb +++ b/test.rb @@ -163,13 +163,10 @@ def analyze_good_clusters next end compiler = Interscript.load(map_id, maps, compiler: Interscript::Compiler::Ruby) - result_fnro = compiler.(original.full_name_ro) - result_fnrg = compiler.(original.full_name_rg) + result_fn = compiler.(original.full_name) - if error = compare_and_return_error(result_fnro, i.full_name_ro) - results[transl] << {error: error, group: group, result: 
[result_fnro, result_fnrg]} - elsif error = compare_and_return_error(result_fnrg, i.full_name_rg) - results[transl] << {error: error, group: group, result: [result_fnro, result_fnrg]} + if error = compare_and_return_error(result_fn, i.full_name) + results[transl] << {error: error, group: group, result: result_fn} else results[transl] << {ok: true, group: group} end @@ -216,10 +213,12 @@ def analyze_translit_systems end class Name - FIELDS=%i[ufi uni mgrs nt lc full_name_ro full_name_rg name_link transl_cd] + FIELDS=%i[ufi uni mgrs nt lang_cd full_name name_link transl_cd] INT_FIELDS=%i[ufi uni name_link] attr_accessor *FIELDS + alias lc lang_cd + def initialize(geotest, **kwargs) @geotest = geotest kwargs.each do |k,v| From 0c845c588964331b74a4b361dd9c0e4e385f86ed Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Wed, 1 May 2024 06:38:54 +0200 Subject: [PATCH 2/2] Add a possibility to generate an error file. --- .gitignore | 5 ++ errors_documentation.md | 68 +++++++++++++++++++ test.rb | 143 ++++++++++++++++++++++++++++++++++++---- test_all.sh | 2 + test_single.sh | 7 ++ 5 files changed, 212 insertions(+), 13 deletions(-) create mode 100644 .gitignore create mode 100644 errors_documentation.md create mode 100755 test_all.sh create mode 100755 test_single.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1f21f3f --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +/Gemfile.lock +/sources +/output +_* + diff --git a/errors_documentation.md b/errors_documentation.md new file mode 100644 index 0000000..f4f5ede --- /dev/null +++ b/errors_documentation.md @@ -0,0 +1,68 @@ +# Errors file documentation + +The errors file is a TSV file generated by GeoTest. + +Errors are grouped by ID. If a particular stage has found an error with multiple records, all are grouped by the `error_id`. + +**Interscript bugs are not reported by default**. Please run this script with `--bugs` to also return Interscript bugs in the report. 
+ +# `error_type` + +## Pre-analysis errors + +### uni_duplicate +In the analyzed file, a record is supplied twice. UNI is expected to be a unique key, yet in the analyzed file more than one +record contains that key. + +## Clustering errors +During the clustering phase, we attempt to generate a cluster of entries based on their `name_link` attribute. In theory, this should create +a 1-1 connection between a script (non-transliterated) entry and a transliterated entry. In practice, sometimes a cluster contains more than +2 entries - we tolerate that as long as there is exactly 1 script (non-transliterated) entry in such a cluster. Otherwise we report one +of the following errors: + +### length +A cluster is expected to contain at least 2 records, yet it contains fewer than 2 records. + +### no_script +A cluster contains no script (original; non-transliterated) entry. + +### no_transl +None of the cluster entries contain information about transliteration. **This will make us attempt to find the matching transliteration system.** + +### too_much_script +A cluster contains more than one script (original; non-transliterated) entry. + +### no_map +**This is an Interscript bug**. This entry means that there is no Interscript support for a given transliteration map. + +## Testing errors +During the testing phase, we attempt to use the transliteration info (`transl_cd`) to transliterate the original name, then compare both records. + +Since we are able to transliterate, a field `attempted_transliteration` will contain our attempted automated transliteration. + +All those errors are reported, but it's possible that some of them reflect only minor formatting differences. + +### casing +The entries match except for their casing. + +### punctuation +The entries match except for their punctuation. + +### casing_and_punctuation +The entries match except for their casing AND punctuation. + +### spacing_or_punctuation +The entries match except for their spacing OR punctuation. 
+ +### casing_and_spacing_or_punctuation +The entries match except for their casing AND (spacing OR punctuation). + +### transliteration +The entries **don't** match. **This will make us attempt to find the matching transliteration system.** + +# Finding the transliteration system +If the error type is `no_transl` or `transliteration`, we will attempt to find the closest transliteration system. If there is one that +matches 100% (i.e. the Levenshtein distance is reported as 0), a field `other_matching_maps` will contain a comma-separated list of maps +supported by Interscript that, when used, produce a 100% correct transliteration. + +Note that when `lang_cd` is supplied, only maps pertaining to that language are tried, in order to save analysis time. diff --git a/test.rb b/test.rb index b582291..1778c8e 100644 --- a/test.rb +++ b/test.rb @@ -1,11 +1,16 @@ require "interscript" require "interscript/compiler/ruby" require "optparse" +require "csv" class Interscript::GeoTest - def initialize(file, verbose: false) + def initialize(file, verbose: false, report_bugs: false, error_file: nil) @file = file @verbose = verbose + @errors = [] + @report_bugs = report_bugs + @error_file = error_file + @last_id = 0 end def start @@ -21,6 +26,8 @@ def start analyze_translit_systems analyze_usability_of_related_clusters analyze_good_clusters + # --- output --- + output_found_errors end def self.start(...) 
= new(...).start @@ -68,8 +75,11 @@ def cluster_by_related attr_reader :records, :records_by_ufi, :records_by_uni, :records_by_transl, :related_clusters, :unique_related_clusters def analyze_uni_uniqueness - count = @records_by_uni.values.select { |i| i.length > 1 }.count - puts "#{count} records have a non-unique UNI (should be 0)" + duplicates = @records_by_uni.values.select { |i| i.length > 1 } + puts "#{duplicates.count} records have a non-unique UNI (should be 0)" + duplicates.each do |name| + add_error :uni_duplicate, name + end puts end @@ -94,18 +104,23 @@ def analyze_usability_of_related_clusters @unique_related_clusters.each do |cluster| if cluster.length < 2 # A bug - likely due to wrong data + add_error :length, cluster errors[:length] << cluster - elsif cluster.none? { |i| %w[NS DS VS].include? i.nt } + elsif cluster.none?(&:script?) # We can do nothing about it + add_error :no_script, cluster errors[:no_script] << cluster elsif cluster.none? { |i| i.transl_cd != '' } # TODO: Add some heuristics per run? + add_error :no_transl, cluster errors[:no_transl] << cluster - elsif cluster.count { |i| %w[NS DS VS].include? i.nt } > 1 + elsif cluster.count(&:script?) > 1 # TODO: split those by some heuristic like by LC + add_error :too_much_script, cluster errors[:too_much_script] << cluster elsif cluster.none? 
{ |i| geo_to_is i.transl_cd } # We don't have a usable map for those entries + add_error :no_map, cluster errors[:no_map] << cluster else good << cluster @@ -124,32 +139,38 @@ def analyze_usability_of_related_clusters @good_clusters = good end - def compare_and_return_error(first, second) + def compare_and_return_error(first, second, group) if first == second nil elsif first.downcase == second.downcase + add_error :casing, group, attempted_transliteration: first "Incorrect casing" elsif first.gsub(/[^[:alpha:][:space:]]/,'') == second.gsub(/[^[:alpha:][:space:]]/,'') + add_error :punctuation, group, attempted_transliteration: first "Incorrect punctuation" elsif first.downcase.gsub(/[^[:alpha:][:space:]]/,'') == second.downcase.gsub(/[^[:alpha:][:space:]]/,'') + add_error :casing_and_punctuation, group, attempted_transliteration: first "Incorrect casing and punctuation" elsif first.gsub(/[^[:alpha:]]/,'') == second.gsub(/[^[:alpha:]]/,'') + add_error :spacing_or_punctuation, group, attempted_transliteration: first "Incorrect spacing or punctuation" elsif first.downcase.gsub(/[^[:alpha:]]/,'') == second.downcase.gsub(/[^[:alpha:]]/,'') + add_error :casing_and_spacing_or_punctuation, group, attempted_transliteration: first "Incorrect casing and (spacing or punctuation)" else + add_error :transliteration, group, attempted_transliteration: first "Incorrect transliteration" end end def analyze_good_clusters results = {} - maps = {} + $maps ||= {} @good_clusters.each do |cluster| cluster = cluster.dup - original = cluster.find { |i| %w[NS DS VS].include? i.nt } + original = cluster.find(&:script?) 
cluster.delete(original) # The rest of entries in the cluster are transliterated entries @@ -162,10 +183,10 @@ def analyze_good_clusters results[transl] << {error: "No support in Interscript", group: group} next end - compiler = Interscript.load(map_id, maps, compiler: Interscript::Compiler::Ruby) + compiler = Interscript.load(map_id, $maps, compiler: Interscript::Compiler::Ruby) result_fn = compiler.(original.full_name) - if error = compare_and_return_error(result_fn, i.full_name) + if error = compare_and_return_error(result_fn, i.full_name, group) results[transl] << {error: error, group: group, result: result_fn} else results[transl] << {ok: true, group: group} @@ -212,8 +233,32 @@ def analyze_translit_systems puts end + def output_found_errors + if @error_file + errors = @errors.map(&:to_h) + + CSV.open(@error_file, "wb", col_sep: "\t") do |csv| + csv << Error::KEYS + errors.each do |hash| + csv << hash.values + end + end + end + end + + def add_error(type, names, **kwargs) + # Skip reporting Interscript bugs by default + return if !@report_bugs && %i[no_map].include?(type) + + names = Array(names) + @last_id += 1 + names.each do |name| + @errors << Error.new(@last_id, type, name, **kwargs) + end + end + class Name - FIELDS=%i[ufi uni mgrs nt lang_cd full_name name_link transl_cd] + FIELDS=%i[ufi uni mgrs nt lang_cd full_name name_link transl_cd script_cd] INT_FIELDS=%i[ufi uni name_link] attr_accessor *FIELDS @@ -241,6 +286,65 @@ def related def related_cluster @geotest.related_clusters[uni] || [] end + + def script? + %w[NS DS VS].include? 
nt + end + end + + class Error + KEYS=%i[error_id error_type ufi uni nt full_name lang_cd transl_cd script_cd + attempted_transliteration other_matching_maps] + + def initialize(id, type, name, attempted_transliteration: nil) + @id, @type, @name = id, type, name + @attempted_transliteration = attempted_transliteration + end + attr_reader :id, :type, :name, :attempted_transliteration, :other_matching_maps + + def determine_other_matching_maps + return if name.script? + + script_name = name.related_cluster.find(&:script?).full_name + transliterated_name = name.full_name + + if name.lang_cd == "" + $stderr.puts "* Warning: a record with UFI #{name.ufi} has no lang_cd. Trying all maps - may take some time." + end + + result = Interscript.detect( + script_name, + transliterated_name, + compiler: Interscript::Compiler::Ruby, + cache: $cache, + multiple: true, + map_pattern: name.lang_cd != "" ? "*-#{name.lang_cd}-*" : "*" + ) + result = result.select { |_,v| v == 0 }.to_h.keys + result = result.join(", ") + @other_matching_maps = result + end + + def to_h + if %i[no_transl transliteration].include? type + determine_other_matching_maps + end + + {error_id: id, + error_type: type, + + ufi: name.ufi, + uni: name.uni, + nt: name.nt, + full_name: name.full_name, + lang_cd: name.lang_cd, + transl_cd: name.transl_cd, + script_cd: name.script_cd, + + attempted_transliteration: attempted_transliteration, + other_matching_maps: other_matching_maps + } + end end end @@ -248,8 +352,21 @@ def related_cluster OptionParser.new do |opts| opts.banner = "Usage: #{$0} [options] file" - opts.on("-v", "--verbose", "Describe all failures") do - options[:verbose] = true + # This function is obsolete. Please use the error file facility. 
+ # opts.on("-v", "--verbose", "Describe all failures") do + # options[:verbose] = true + # end + + opts.on("-b", "--bugs", "Report interscript bugs in error file") do + options[:report_bugs] = true + end + + opts.on("-o", "--output=FILE", "Output the analysis summary to FILE") do |file| + $stdout = File.open(file, 'w') + end + + opts.on("-e", "--error-file=FILE", "Generate a TSV error file, containing all found errors") do |file| + options[:error_file] = file end opts.on("-h", "--help", "Prints this help") do diff --git a/test_all.sh b/test_all.sh new file mode 100755 index 0000000..766bdf5 --- /dev/null +++ b/test_all.sh @@ -0,0 +1,2 @@ +#!/bin/bash +find sources -mindepth 1 -maxdepth 1 -type d -print0 | cut -z -d/ -f2 | xargs -0 -P`nproc` -n1 ./test_single.sh diff --git a/test_single.sh b/test_single.sh new file mode 100755 index 0000000..26f367f --- /dev/null +++ b/test_single.sh @@ -0,0 +1,7 @@ +#!/bin/bash +COUNTRY="$1" + +mkdir -p "output/$COUNTRY" +echo "* Started: $COUNTRY" +bundle exec ruby test.rb --output="output/$COUNTRY/result.txt" --error-file="output/$COUNTRY/errors.tsv" "sources/$COUNTRY/$COUNTRY.txt" +echo "* Finished: $COUNTRY"