diff --git a/analysis/README.md b/analysis/README.md
deleted file mode 100644
index 44f02baa..00000000
--- a/analysis/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Ancestral State Reconstruction based on Correspondence Patterns
-
-This project uses correspondence pattern identification in a subset of the NorthPeruLex dataset with ≥ 140 concepts in common to infer ancestral states of the sounds through out the trees.
-
-## Instructions to run the workflow
-
-The `analysis` directory where this `README.md` file is located contains two scripts that can be run from the command line as ordinary python scripts.
-They are written here in order of implementation: (1) `s_aling.py` and (2) `s_ASR_patterns.py`.
-One can run one script after another as the workflow.
-Each one of the scripts fulfills the tasks described next:
-
-1. Load the dataset, compute the concept mutual coverage across the data and select a subset of languages with a a total of 140 concepts in common, and cluster and align the data
-3. Automatically infer the correspondence patterns found in the subset and perform Sankoff's Maximum Parsimony algorithm for Ancestral State Reconstruction out of the correspondence patterns
\ No newline at end of file
diff --git a/analysis/get_stats.py b/analysis/get_stats.py
new file mode 100644
index 00000000..52063110
--- /dev/null
+++ b/analysis/get_stats.py
@@ -0,0 +1,88 @@
+from collections import defaultdict
+from pathlib import Path
+from lingpy import Wordlist
+from lingpy.compare.sanity import average_coverage, mutual_coverage_check
+from tabulate import tabulate
+
+path = Path(".")
+wl = Wordlist.from_cldf(
+    "../cldf/cldf-metadata.json",
+    columns=(
+        "concept_id",
+        "language_id",
+        "concept_name",
+        "language_name",
+        "value",
+        "segments",
+        "form",
+        "glottocode",
+        "concept_concepticon_id",
+        "source",
+        "language_subgroup"
+    ),
+    namespace=(
+        ("language_id", "doculect"),
+        ("language_subgroup", "subgroup"),
+        ("concept_name", "concept"),
+        ("segments", "tokens"),
+        ("cognacy", "cogid"),
+        ("source", "source")
+    )
+)
+
+lang_count = defaultdict(int)
+total_concepts = len(set(wl.get_list("concept", flat=True)))
+
+for item in wl:
+    lang = wl[item, "doculect"]
+    lang_count[lang] += 1
+
+table_data = []
+covs_total = 0
+
+for lang in sorted(lang_count.keys()):
+    # Calculate coverage percentage
+    coverage_pctg = round((lang_count[lang] / total_concepts) * 100, 1)
+    covs_total += coverage_pctg
+
+    indices = wl.get_list(col=lang, flat=True)
+    if indices:
+        idx = indices[0]
+        lang_name = wl[idx, "language_name"]
+        glottocode = wl[idx, "glottocode"]
+        subgroup = wl[idx, "subgroup"]
+        source = wl[idx, "source"]
+
+        table_data.append([
+            lang_name,
+            glottocode,
+            subgroup,
+            source,
+            f"{lang_count[lang]} ({coverage_pctg}%)"
+        ])
+
+table = tabulate(
+    table_data,
+    headers=[
+        "Language",
+        "Glottocode",
+        "Language Family",
+        "Sources",
+        "Coverage"
+    ],
+    tablefmt="pipe"
+)
+
+print(table)
+
+avg_coverage = round(covs_total / len(lang_count), 2)
+avg_mutualcov = average_coverage(wl)
+
+print(f"Average coverage: {avg_coverage}%")
+print(f"Average mutual coverage: {avg_mutualcov:.2f}")
+
+# Find the largest number of concepts shared by every pair of languages
+for i in range(total_concepts, 0, -1):
+    if mutual_coverage_check(wl, i):
+        print(f"Minimal mutual coverage: {i} concept pairs")
+        break
\ No newline at end of file
diff --git a/analysis/s_ASR_patterns.py b/analysis/s_ASR_patterns.py
deleted file mode 100644
index 56fa418d..00000000
--- a/analysis/s_ASR_patterns.py
+++ /dev/null
@@ -1,157 +0,0 @@
-from lingpy import *
-import os
-from lingrex.util import add_structure
-from lingrex.copar import CoPaR, consensus_pattern
-from collections import defaultdict
-from lingpy.compare.strings import ldn_swap, bidist2, tridist2
-from lingpy.algorithm.clustering import neighbor
-import numpy as np
-from pylotree import Tree, NodeLabels
-from pyloparsimony import up, down, parsimony
-from pyloparsimony.util import scenario_ascii_art
-import newick
-
-output_directory = 'CP_trees'
-if not os.path.exists(output_directory):
-    os.makedirs(output_directory)
-
-def compute_matrix(symbols):
-    """This function computes the similarity matrix for each correspondence pattern
-    out of Levenstein distance with swap included."""
-    filtered = [s for s in symbols if s != 'Ø']
-    n = len(filtered)
-    matrix = np.zeros((n, n))
-    for i in range(n):
-        for j in range(n):
-            levenshtein_distance = ldn_swap(filtered[i], filtered[j], normalized=True)
-            bigram_distance = bidist2(filtered[i], filtered[j], normalized=True)
-            trigram_distance = tridist2(filtered[i], filtered[j], normalized=True)
-            combined_distance = (
-                0.34 * levenshtein_distance +
-                0.33 * bigram_distance +
-                0.33 * trigram_distance
-            )
-            matrix[i, j] = combined_distance
-
-    return filtered, matrix
-
-
-# Get the patterns
-alms = Alignments("npl.tsv", ref="cogids", transcription="form")
-alms.align()
-add_structure(alms, model="cv")
-alms.output('tsv', filename="npl-aligned", ignore="all", prettify=False)
-
-cop = CoPaR("npl-aligned.tsv", min_refs=5, ref='cogids', transcription="form")
-cop.get_sites()
-cop.cluster_sites()
-cop.sites_to_pattern()
-cop.add_patterns()
-cop.write_patterns("npl-patterns.tsv")
-
-# Get consensus patterns
-pattern_dict = defaultdict(list)
-for key, pattern_list in cop.patterns.items():
-    for pattern in pattern_list:
-        sounds = pattern[2]
-        for doculect, sound in zip(cop.cols, sounds):
-            pattern_dict[key].append((doculect, sounds))
-
-consensus_patterns = {}
-doculect_sounds_mappings = {}
-for key, docusound_list in pattern_dict.items():
-    sounds = [s for _, s in docusound_list]
-    doculects = [d for d, _ in docusound_list]
-    try:
-        consensus = consensus_pattern(sounds, missing="Ø")
-        consensus_patterns[key] = consensus
-        doculect_sounds_mappings[key] = [
-            f"{doc}_{sound}" for doc, sound in zip(doculects, sounds) if sound != "Ø"
-        ]
-    except ValueError:
-        print(f"Incompatible patterns: {key}")
-
-# Build trees
-# Get the distance matrices
-for key, values in consensus_patterns.items():
-    doc_sound_pairs = [
-        (doc, sound) for doc, sound in zip(cop.cols, values) if sound != "Ø"
-    ]
-    if len(doc_sound_pairs) < 2:
-        continue
-    taxa = [f"{doc}_{sound}" for doc, sound in doc_sound_pairs]
-    sounds = [sound for _, sound in doc_sound_pairs]
-
-    filtered_sounds, matrix = compute_matrix(sounds)
-
-    if len(filtered_sounds) != len(taxa) or matrix.shape[0] != len(taxa):
-        continue
-
-    headers = [""] + taxa
-    table = [
-        [taxa[i]] + [f"{val:.2f}" for val in matrix[i]]
-        for i in range(len(taxa))
-    ]
-
-    dist_matrix = matrix.tolist()
-
-    # Apply NJ for trees construction
-    nwk_tree = neighbor(dist_matrix, taxa)
-    parsed_newick = newick.loads(nwk_tree)
-    # Handling trees
-    parsed_tree = Tree(nwk_tree, name=f"{key[0]}_{key[1]}")
-    labeled_tree = parsed_tree.newick
-
-    # Output tree
-    output_filename = os.path.join(
-        output_directory, f"tree_cogid{key[0]}_slot{key[1]}.nwk"
-    )
-    with open(output_filename, "w") as f:
-        f.write(labeled_tree)
-
-    # Maximum Parsimony algorithm for ancestral state reconstruction
-    pattern = {
-        f"{doc}_{sound}": [sound]
-        for doc, sound in doc_sound_pairs
-    }
-    characterlist = sorted(set([sound for _, sound in doc_sound_pairs]))
-    #print(characterlist)
-
-    matrix_dict = {
-        (i, j): matrix[i][j]
-        for i in range(len(filtered_sounds))
-        for j in range(len(filtered_sounds))
-    }
-    sound_index = {sound: i for i, sound in enumerate(filtered_sounds)}
-    penalty_matrix = []
-    for c1 in characterlist:
-        row = []
-        for c2 in characterlist:
-            if c1 == c2:
-                cost = 0.0
-            elif c1 == '-':
-                cost = 2.0  # Gain is more costly
-            elif c2 == '-':
-                cost = 1.0  # Loss is less costly
-            else:
-                cost = 1.0  # Substitution
-            row.append(cost)
-        penalty_matrix.append(row)
-
-    # Computer costs UP and DOWN the tree
-    W = up(
-        parsed_tree,
-        characterlist,
-        penalty_matrix,
-        pattern
-    )
-    scenarios = down(
-        parsed_tree,
-        characterlist,
-        penalty_matrix,
-        W
-    )
-
-    best = scenarios[0]
-    tree_art = scenario_ascii_art(best, parsed_tree)
-    print(tree_art)
\ No newline at end of file
diff --git a/analysis/s_align.py b/analysis/s_align.py
deleted file mode 100644
index 2f64328e..00000000
--- a/analysis/s_align.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from lingpy import Wordlist, LexStat, Alignments
-import re
-from lingpy.compare.partial import Partial
-from lingpy.sequence.sound_classes import tokens2class
-from lingpy.compare.sanity import mutual_coverage_subset
-
-def clean_slash(x):
-    """Cleans slash annotation from EDICTOR."""
-    cleaned = []
-    for segment in x:
-        if "/" in segment:
-            after_slash = re.split("/", segment)[1]
-            cleaned.append(after_slash)
-        else:
-            cleaned.append(segment)
-
-    return cleaned
-
-# Data preprocessing
-cols = [
-    'concept_id', 'concept_name', 'language_id', 'language_name', 'value', 'form', 'segments',
-    'glottocode', 'concept_concepticon_id', 'comment', 'language_family', 'language_subgroup'
-]
-
-# Load dataset
-wl = Wordlist.from_cldf(
-    '../cldf/cldf-metadata.json',
-    columns=cols,
-    namespace=(
-        ("language_id", "doculect"),
-        ("language_family", "family"),
-        ("language_subgroup", "subgroup"),
-        ("concept_name", "concept"),
-        ("segments", "tokens"),
-        ("cognacy", "cogid")
-    ))
-
-# Deleting unnecessary tokens with clean_slash
-for idx in wl:
-    wl[idx, "tokens"] = [x.split("/")[1] if "/" in x else x for x in wl[idx, "tokens"]]
-
-number_of_languages, pairs = mutual_coverage_subset(wl, 100)
-for number_of_items, languages in pairs:
-    print(number_of_items, ', '.join(languages))
-
-selected_ls = set().union(*(langs for _, langs in pairs))
-
-selected_ls.discard("Proto-Bora-Muinane")
-
-D = {0: [c for c in wl.columns]}
-for idx in wl:
-    if (
-        wl[idx, "doculect"] in selected_ls
-    ):
-        D[idx] = [wl[idx, c] for c in D[0]]
-
-wl_filtered = Wordlist(D)
-
-#wl_filtered.output(
-#    "tsv",
-#    filename="npl-filtered"
-#)
-
-# Run AutoCogid
-lex = LexStat(wl_filtered)
-lex.get_scorer(runs=10000)
-lex.cluster(threshold=0.55, method="lexstat", cluster_method="infomap", ref="cogid")
-
-# Get morpheme segmentation
-parcog = Partial(lex, segments='tokens')
-parcog.partial_cluster(threshold=0.55, method="lexstat", ref="cogids")
-
-# Align data
-alms = Alignments(parcog, ref="cogids", transcription="tokens")
-alms.align(ref="cogids")
-alms.add_entries("morphemes","tokens", lambda x: " ".join([y for y in x]), override=True)
-alms.add_entries("alignment", "tokens", lambda x: " ".join([y for y in x]), override=True)
-alms.add_entries("structure", "tokens", lambda x: tokens2class(x, "cv"))
-
-alms.output("tsv", filename="npl")
\ No newline at end of file
diff --git a/etc/languages.tsv b/etc/languages.tsv
index c2d4e189..3d7f63e8 100644
--- a/etc/languages.tsv
+++ b/etc/languages.tsv
@@ -3,7 +3,7 @@ Yanesha	Yanesha'	yane1238	Arawakan	Adriano	Peña2024
 Resigaro	Resígaro	resi1247	Arawakan	Adriano	Huber2021
 Bora	Bora	bora1263	Boran	Carlos	Seifart2024
 Muinane	Muinane	muin1242	Boran	Carlos	Seifart2024
-Proto-Bora-Muinane	Proto-Bora-Muinane		Boran	Carlos	Seifart2024
+Proto-Bora-Muinane	Proto-Bora-Muinane	bora1262	Boran	Carlos	Seifart2024
 Shiwilu	Jebero	jebe1250	Cahuapanan	Frederic	Valenzuela2012
 Chayahuita	Shawi	chay1248	Cahuapanan	Frederic	Johansson2021
 AchuarShiwiar	Achuar	achu1248	Chicham	Jaime	Fast2008	https://repositorio.cultura.gob.pe/handle/CULTURA/425