uploading scripts
yashmadhani97 committed May 22, 2023
1 parent ccedb3e commit 2e2229b
Showing 204 changed files with 55,897 additions and 0 deletions.
72 changes: 72 additions & 0 deletions Benchmark/compile_final_pilot_1.py
@@ -0,0 +1,72 @@
import pandas as pd

# languages covered by the pilot benchmark
lang_code_list = ['as', 'brx', 'gom', 'ks', 'mai', 'mni', 'ne', 'or', 'sa']

# parallel sentence lists per language: native script and its romanization
native_data_dict = {lang_code: [] for lang_code in lang_code_list}
roman_data_dict = {lang_code: [] for lang_code in lang_code_list}

# load from benchmark_reports_pilot_1

for lang_code in lang_code_list:
    sheet = pd.read_csv('../benchmark_reports_pilot_1/' + lang_code + '.csv')
    sheet = sheet[['input_text', 'output_text']]
    sheet = sheet.values.tolist()

    # deduplicate on the native-script sentence, keeping the two lists aligned
    for line in sheet:
        if line[0] not in native_data_dict[lang_code]:
            native_data_dict[lang_code].append(line[0])
            roman_data_dict[lang_code].append(line[1])

# load from extra_pilot_1
xlsx_file_name = '../extra_pilot_1/Extra_sens_pilot_1_combined_final.xlsx'
xlsx_file = pd.ExcelFile(xlsx_file_name)

sheet_names = ['as', 'brx', 'ks', 'or', 'mai', 'sa']

for sheet_name in sheet_names:
    sheet = xlsx_file.parse(sheet_name)
    sheet = sheet.values.tolist()

    # deduplicate against this sheet's own language bucket
    for line in sheet:
        if line[0] not in native_data_dict[sheet_name]:
            native_data_dict[sheet_name].append(line[0])
            roman_data_dict[sheet_name].append(line[1])

# report how many pairs each language contributes after the 512-line cap
for lang_code in native_data_dict:
    print('lang : ', lang_code)
    print('native : ', len(native_data_dict[lang_code][:512]))
    print('roman : ', len(roman_data_dict[lang_code][:512]))


# write the first 512 aligned pairs per language
for lang_code in native_data_dict:
    with open('native_script/' + lang_code + '_native.txt', 'w') as file:
        file.write('\n'.join(native_data_dict[lang_code][:512]))

    with open('roman_script/' + lang_code + '_roman.txt', 'w') as file:
        file.write('\n'.join(roman_data_dict[lang_code][:512]))
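
A quick way to check the output is to confirm that each language's native and roman files stay line-aligned. A minimal sketch, assuming the native_script/ and roman_script/ layout and the 512-line cap from the script above:

import os

# hypothetical sanity check: each language's native and roman files should
# have the same number of lines, at most 512 (paths as in the script above)
for lang_code in ['as', 'brx', 'gom', 'ks', 'mai', 'mni', 'ne', 'or', 'sa']:
    with open(os.path.join('native_script', lang_code + '_native.txt')) as f:
        native_lines = f.read().split('\n')
    with open(os.path.join('roman_script', lang_code + '_roman.txt')) as f:
        roman_lines = f.read().split('\n')
    assert len(native_lines) == len(roman_lines) <= 512, lang_code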

262 changes: 262 additions & 0 deletions Benchmark/create_benchmark.py
@@ -0,0 +1,262 @@
import random


# training-data sources per language:
#   i = IndicCorp, n = NLLB, w = Wikipedia, v = Vikaspedia,
#   a = AI4Bharat annotator data, nws = news crawl
train_data_sources = {
    'as' : ['i', 'w', 'v'],
    'bn' : ['i', 'w', 'v'],
    'brx' : ['i', 'n', 'a', 'v'],
    'dg' : ['i', 'v'],
    'gom' : ['n', 'a', 'w', 'v'],
    'gu' : ['i', 'w', 'v'],
    'hi' : ['i', 'w', 'v'],
    'kn' : ['i', 'w', 'v'],
    'mai' : ['i', 'a', 'w', 'v'],
    'ml' : ['i', 'w', 'v'],
    'mr' : ['i', 'w', 'v'],
    'ne' : ['i', 'a', 'w', 'v'],
    'or' : ['i', 'w', 'v'],
    'pa' : ['i', 'w', 'v'],
    'sa' : ['i', 'n', 'a', 'w', 'v'],
    'sat' : ['i', 'n', 'w', 'v'],
    'sd' : ['i', 'w'],
    'ta' : ['i', 'w', 'v'],
    'te' : ['i', 'w', 'v'],
    'ur' : ['i', 'a'],
    'ks_arab' : ['n', 'a'],
    'ks_deva' : ['n'],
    'mni_beng' : ['n'],
    'mni_mei' : ['i', 'a', 'w'],
    'en' : ['i'],
    'other' : ['nws'],
}
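
Each source letter expands to one per-language input file. A minimal illustration of the expansion for 'dg' (directory roots and the file-name pattern are taken from the loading code below; the local names here are illustrative):

# hypothetical sketch: expand the 'dg' source codes into concrete input paths
dirs = {
    'i' : '../../preprocess_indiccorp/IndicCorp_data_subset_tok_norm_romanized_100k_sample_cleaned/',
    'v' : '../../preprocess_vikaspedia/vikaspedia_tok_norm_romanized_100k_sample_cleaned/',
}
for source in ['i', 'v']:  # train_data_sources['dg']
    print(dirs[source] + 'dg' + '/' + 'dg' + '_indic_tok.txt')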


samples = 512

lang_code_list = ['dg']

# directory roots for the cleaned (100k-sample) and raw romanized corpora,
# keyed by the source codes used in train_data_sources
clean_dirs = {
    'i'   : '../../preprocess_indiccorp/IndicCorp_data_subset_tok_norm_romanized_100k_sample_cleaned/',
    'n'   : '../../Other_sources/nllb_preprocess/nllb_data_subset_tok_norm_romanized_100k_sample_cleaned/',
    'w'   : '../../preprocess_wikipedia/wikipedia_tok_norm_romanized_100k_sample_cleaned/',
    'v'   : '../../preprocess_vikaspedia/vikaspedia_tok_norm_romanized_100k_sample_cleaned/',
    'nws' : '../../preprocess_news_crawl/news_tok_norm_romanized_100k_sample_cleaned/',
    'a'   : '../../Other_sources/annotator_ai4bharat_prerocess/train_tok_norm_romanized_100k_sample_cleaned/',
}

unclean_dirs = {
    'i'   : '../../preprocess_indiccorp/IndicCorp_data_subset_tok_norm_romanized/',
    'n'   : '../../Other_sources/nllb_preprocess/nllb_data_subset_tok_norm_romanized/',
    'w'   : '../../preprocess_wikipedia/wikipedia_tok_norm_romanized/',
    'v'   : '../../preprocess_vikaspedia/vikaspedia_tok_norm_romanized/',
    'nws' : '../../preprocess_news_crawl/news_tok_norm_romanized/',
    'a'   : '../../Other_sources/annotator_ai4bharat_prerocess/train_tok_norm_romanized/',
}


def load_train_lines(dir_map, lang_code):
    """Concatenate the lines of every training source listed for lang_code."""
    lines = []
    for source in train_data_sources[lang_code]:
        file_name = dir_map[source] + lang_code + '/' + lang_code + '_indic_tok.txt'
        with open(file_name, 'r') as file_train_in:
            lines += file_train_in.read().split('\n')
    return lines


for lang_code in lang_code_list:

    lines_train_clean = load_train_lines(clean_dirs, lang_code)
    lines_train_unclean = load_train_lines(unclean_dirs, lang_code)

    # candidate test pool: the held-out annotator split
    with open('../../Other_sources/annotator_ai4bharat_train_test_split/test/' + lang_code + '_test.txt', 'r') as file_in:
        lines_in = file_in.read().split('\n')

    print('len lines_in : ', len(lines_in))

    # drop any candidate line that also occurs in the training data,
    # cleaned or uncleaned, to avoid train/test leakage
    lines_in = list(set(lines_in).difference(set(lines_train_clean)))
    lines_in = list(set(lines_in).difference(set(lines_train_unclean)))

    print('len lines_in : ', len(lines_in))

    # sample without replacement so the benchmark contains no duplicate lines
    # (requires at least `samples` surviving candidates)
    lines_in = random.sample(lines_in, k=samples)

    with open(lang_code + '_test.txt', 'w') as file_out:
        file_out.write('\n'.join(lines_in))
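
The core of the loop is the dedup-then-sample step: remove every candidate seen in training, then draw the benchmark lines from what is left. A self-contained toy illustration (the data here is illustrative, not from the script):

import random

# toy data: five candidate lines, two of which also appear in training
candidates = ['s1', 's2', 's3', 's4', 's5']
train_lines = {'s2', 's4'}

# keep only candidates never seen in training, then sample without replacement
pool = list(set(candidates).difference(train_lines))
picked = random.sample(pool, k=2)
print(picked)  # e.g. ['s5', 's1']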


# Commented-out variant for 'as' and 'or': the same dedup-and-sample pipeline,
# but the candidate pool is read from the raw IndicCorp combine files instead
# of the annotator test split.

# lang_code_list = ['as', 'or']

# for lang_code in lang_code_list:

#     lines_train_clean = load_train_lines(clean_dirs, lang_code)
#     lines_train_unclean = load_train_lines(unclean_dirs, lang_code)

#     with open('../../preprocess_indiccorp/IndicCorp_data/' + lang_code + '/' + lang_code + '_combine.txt', 'r') as file_in:
#         lines_in = file_in.read().split('\n')

#     print('len lines_in : ', len(lines_in))

#     lines_in = list(set(lines_in).difference(set(lines_train_clean)))
#     lines_in = list(set(lines_in).difference(set(lines_train_unclean)))

#     print('len lines_in : ', len(lines_in))

#     lines_in = random.sample(lines_in, k=samples)

#     with open(lang_code + '_test.txt', 'w') as file_out:
#         file_out.write('\n'.join(lines_in))