-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ccedb3e
commit 2e2229b
Showing
204 changed files
with
55,897 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
import pandas as pd

# Per-language accumulators kept as parallel lists: index i of
# native_data_dict[lang] pairs with index i of roman_data_dict[lang].
lang_code_list = ['as', 'brx', 'gom', 'ks', 'mai', 'mni', 'ne', 'or', 'sa']

native_data_dict = {lang: [] for lang in lang_code_list}
roman_data_dict = {lang: [] for lang in lang_code_list}

# --- Load from benchmark_reports_pilot_1 -----------------------------------
for lang_code in lang_code_list:
    sheet = pd.read_csv('../benchmark_reports_pilot_1/' + lang_code + '.csv')
    rows = sheet[['input_text', 'output_text']].values.tolist()

    for line in rows:
        # De-duplicate on the native-script sentence only; the roman list is
        # appended in lockstep so the two lists stay index-aligned.
        if line[0] not in native_data_dict[lang_code]:
            native_data_dict[lang_code].append(line[0])
            roman_data_dict[lang_code].append(line[1])

# --- Load from extra_pilot_1 -----------------------------------------------
xlsx_file_name = '../extra_pilot_1/Extra_sens_pilot_1_combined_final.xlsx'
sheet_names = ['as', 'brx', 'ks', 'or', 'mai', 'sa']

# Open the workbook once instead of re-parsing the file for every sheet.
xlsx = pd.ExcelFile(xlsx_file_name)

for sheet_name in sheet_names:
    rows = xlsx.parse(sheet_name).values.tolist()

    for line in rows:
        # BUG FIX: the original tested membership against
        # native_data_dict[lang_code], where lang_code was the stale loop
        # variable left over from the previous loop (always 'sa' by this
        # point), so deduplication was checked against the wrong language.
        # The check must use the current sheet's language code.
        if line[0] not in native_data_dict[sheet_name]:
            native_data_dict[sheet_name].append(line[0])
            roman_data_dict[sheet_name].append(line[1])

for lang_code in native_data_dict:
    print('lang : ', lang_code)
    print('native : ', len(native_data_dict[lang_code][:512]))
    print('roman : ', len(roman_data_dict[lang_code][:512]))

# Write out at most 512 aligned sentence pairs per language.  Context
# managers replace the open/close pairs so the files are closed even if a
# write fails.
for lang_code in native_data_dict:
    with open('native_script/' + lang_code + '_native.txt', 'w') as file:
        file.write('\n'.join(native_data_dict[lang_code][:512]))

    with open('roman_script/' + lang_code + '_roman.txt', 'w') as file:
        file.write('\n'.join(roman_data_dict[lang_code][:512]))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,262 @@ | ||
import random

# Which corpora contribute training data for each language code:
#   'i'   = IndicCorp            'n'   = NLLB
#   'w'   = Wikipedia            'v'   = Vikaspedia
#   'nws' = news crawl           'a'   = AI4Bharat annotator data
train_data_sources = {
    'as': ['i', 'w', 'v'],
    'bn': ['i', 'w', 'v'],
    'brx': ['i', 'n', 'a', 'v'],
    'dg': ['i', 'v'],
    'gom': ['n', 'a', 'w', 'v'],
    'gu': ['i', 'w', 'v'],
    'hi': ['i', 'w', 'v'],
    'kn': ['i', 'w', 'v'],
    'mai': ['i', 'a', 'w', 'v'],
    'ml': ['i', 'w', 'v'],
    'mr': ['i', 'w', 'v'],
    'ne': ['i', 'a', 'w', 'v'],
    'or': ['i', 'w', 'v'],
    'pa': ['i', 'w', 'v'],
    'sa': ['i', 'n', 'a', 'w', 'v'],
    'sat': ['i', 'n', 'w', 'v'],
    'sd': ['i', 'w'],
    'ta': ['i', 'w', 'v'],
    'te': ['i', 'w', 'v'],
    'ur': ['i', 'a'],
    'ks_arab': ['n', 'a'],
    'ks_deva': ['n'],
    'mni_beng': ['n'],
    'mni_mei': ['i', 'a', 'w'],
    'en': ['i'],
    'other': ['nws'],
}

# (source tag) -> directory prefix of that corpus.  Every corpus exists in a
# raw variant (this prefix as-is) and a cleaned variant (prefix +
# '_100k_sample_cleaned'); both follow the same
# <prefix>/<lang>/<lang>_indic_tok.txt layout.
_SOURCE_DIRS = {
    'i': '../../preprocess_indiccorp/IndicCorp_data_subset_tok_norm_romanized',
    'n': '../../Other_sources/nllb_preprocess/nllb_data_subset_tok_norm_romanized',
    'w': '../../preprocess_wikipedia/wikipedia_tok_norm_romanized',
    'v': '../../preprocess_vikaspedia/vikaspedia_tok_norm_romanized',
    'nws': '../../preprocess_news_crawl/news_tok_norm_romanized',
    # 'prerocess' typo preserved: it is the on-disk directory name.
    'a': '../../Other_sources/annotator_ai4bharat_prerocess/train_tok_norm_romanized',
}


def _load_train_lines(lang_code, cleaned):
    """Concatenate the training lines of every source configured for
    *lang_code* in train_data_sources.

    cleaned=True reads the '_100k_sample_cleaned' variant of each corpus,
    cleaned=False the raw variant.  Returns a flat list of lines; only used
    for set-difference below, so source order is irrelevant.
    """
    suffix = '_100k_sample_cleaned' if cleaned else ''
    lines = []
    for tag in train_data_sources[lang_code]:
        base = _SOURCE_DIRS[tag]
        path = base + suffix + '/' + lang_code + '/' + lang_code + '_indic_tok.txt'
        with open(path, 'r') as f:
            lines += f.read().split('\n')
    return lines


samples = 512

lang_code_list = ['dg']

for lang_code in lang_code_list:
    # Training material in both variants; test candidates must appear in
    # neither, to avoid train/test leakage.
    lines_train_clean = _load_train_lines(lang_code, cleaned=True)
    lines_train_unclean = _load_train_lines(lang_code, cleaned=False)

    with open('../../Other_sources/annotator_ai4bharat_train_test_split/test/'
              + lang_code + '_test.txt', 'r') as file_in:
        lines_in = file_in.read().split('\n')

    print('len lines_in : ', len(lines_in))

    # Drop anything that leaked into either training variant.
    lines_in = list(set(lines_in).difference(set(lines_train_clean)))
    lines_in = list(set(lines_in).difference(set(lines_train_unclean)))

    print('len lines_in : ', len(lines_in))

    # NOTE(review): random.choices samples WITH replacement, so the output
    # may contain duplicate lines; random.sample would guarantee distinct
    # lines when enough candidates remain.  Kept as-is to preserve the
    # original behavior.
    lines_in = random.choices(lines_in, k=samples)

    with open(lang_code + '_test.txt', 'w') as file_out:
        file_out.write('\n'.join(lines_in))

# A previously commented-out variant of the loop above drew its candidate
# lines from '../../preprocess_indiccorp/IndicCorp_data/<lang>/<lang>_combine.txt'
# for lang_code_list = ['as', 'or'].  With _load_train_lines it reduces to
# swapping the input path in the `with open(...)` above; the duplicated
# 130-line copy has been removed (recoverable from version control).
Oops, something went wrong.