Skip to content

Files

Latest commit

9723117 · Dec 6, 2022

History

History
This branch is up to date with ashimdahal/jelly:main.

nlp

Folders and files

NameName
Last commit message
Last commit date

parent directory

..
Jun 16, 2022
Dec 6, 2022
Jun 10, 2022
Jun 10, 2022
Jun 18, 2022

This is a small branch of the NEPALI_NLP repo by Sushil Ghimire. His documentation on the module can be found below. Note, however, that we have only used the unicode and spell correction modules for this project.

Getting the module

!pip install git+https://github.com/sushil79g/Nepali_nlp.git

Loading Embedding

from Nepali_nlp import Embeddings
word_vec = Embeddings().load_large_vector()
#word_vec = Embeddings().load_vector() #For small Embedding
#from fasttext_embedding import Fasttext
#word_vec = Fasttext().load()

For Nepali Synonym

from Nepali_nlp import Synonym
Synonym().raw_synonym(word = 'माया',word_vec=word_vec) #method: 1
#output -> स्नेह','प्रेम','आदर','मायाँ','दया','मायालु','श्रद्धा','आत्मियता','स्पर्श','तिमी
Synonym().filter_synonym(word = 'साथी',word_vec=word_vec) #method: 2
#output -> 'भाइहरू','सहपाठी','प्रेमी','दाइ','प्रेमि','बहिनी'

Word-spell corrector

from Nepali_nlp import Corrector
Corrector().corrector(word='सुशल') #In a very raw stage for now.
#output-> ['सुशील', 'सुशील']
Corrector().spell_correct("कस्त भको हेरौ है")
#output-> "कस्तो भयो हेर है"

Nepali text summarizer

from Nepali_nlp import Summerize
Summerize().show_summary(word_vec,text, length_sentence_predict=5)

Nepali unicode to Devanagari Font

from Nepali_nlp import Unicode
text = 'ma ghara jaanchhu'
Unicode().unicode_word(text) #output-> 'म घर जान्छु'

Preeti-font characters to Devanagari Font

from Nepali_nlp import preeti
unicode_word = 'g]kfnL'
print(preeti(unicode_word)) #output-> नेपाली

OCR (optical character recognition)

from Nepali_nlp import OCR
text = OCR(image_location)

Nepali Tokenizer

from Nepali_nlp import Tokenizer
Tokenizer().sentence_tokenize(text) #To tokenize sentence
Tokenizer().word_tokenize(text) #To tokenize word
Tokenizer().character_tokenize(text) #To tokenize character
Tokenizer().sentencepeice_tokenize(text) #Tokenize using BPE

Nepali Stemming

from Nepali_nlp import Stem
text = "सरकारका प्रवक्ता प्रदीप ज्ञवालीले पनि गत बिहीबार उनलाई अनशन तोड्न आग्रह गरेका थिए" #str or list of word
Stem().rootify(text)
#output -> ['सरकार','प्रवक्ता','प्रदीप','ज्ञवाली','पनि','गत','बिहीबार','उन','अनशन','तोड्न','आग्रह','गर','']

Nepali sentence similarity

from Nepali_nlp import  Avg_vector_similar
sentences = ["कुपोषणकै कारण शारीरिक र मानसिक रुपमा कमजोर मात्र होइन, अकालमै ज्यान पनि गुमाउनुको परेको समाचार बग्रेल्ती सुन्न सकिन्छ","कर्णाली प्रदेश सामाजिक विकास मन्त्रालयले उपलब्ध गराएको तथ्यांकले कर्णालीमा प्रत्येक वर्ष जन्मिएका ५ वर्षमुनीका बालबालिका १ हजार जनामध्ये ५८ जनाले ज्यान गुमाउँदै आएको देखाएको छ"]
Avg_vector_similar().pair_similarity(word_vec, sentences) #output-> 0.6817289590835571

Nepali news-portal Scraper (onlinekhabar and ekantipur for now)

from Nepali_nlp import extract_news
news_link = 'https://www.onlinekhabar.com/2019/12/821094'
title, news = extract_news(news_link) #onlinekhabar and ekantipur is supported at the moment.

Show latest news summary

from Nepali_nlp import UpdateNews
title, links, summerized_news = UpdateNews().show_latest(word_vec=word_vec,portal='onlinekhabar',number_of_news=5) #ekantipur portal is also supported

Language Translation (English--Nepali)

from Nepali_nlp import LanguageTranslation
conv = LanguageTranslation()
nepali_text = "प्रधानमन्त्री निवास बालुवाटारमा आज बिहान बसेको बैठकमा आसन्न स्थानीय तहको निर्वाचनको विषयमा छलफल भएको थियो । छलफलपछि सरकारका प्रवक्ता ज्ञानेन्द्रबहादुर कार्कीले निर्वाचन समयमै सम्पन्न गर्न आफूहरु सहमत भएको सन्चारकर्मीलाई जानकारी दिए ।"
english_text = "You already know that a data class is just a regular class. That means that you can freely add your own methods to a data class. As an example, let us calculate the distance between one position and another, along the Earth’s surface"

output = conv.to_nepali(english_text) #output-> ['तपाईँले पहिले नै जान्नु भएको छ कि डेटा वर्ग एउटा नियमित वर्ग मात्र हो । यसको अर्थ तपाईँले डेटा वर्गमा तपाईँको आफ्नै विधिहरू स्वतन्त्र रूपमा थप्न सक्नुहुन्छ । उदाहरणका लागि, हामीलाई पृथ्वीको सतहमा एक स्थान र अर्को बीचको दूरी गणना गर्नुहोस्']

output = conv.to_english(nepali_text) #output-> ['After the meeting, government spokesman Gyanendra Bahadur Patel informed the media that they agreed to hold elections on time.']

TODOs:

  • Nepali Embeddings
  • Tokenizers (sentence, word, character)
  • Stop Words
  • Nepali Words Collection
  • Nepali Word synonym
  • Roman Nepali to Nepali
  • Nepali OCR
  • Summarization
  • Pos_tag
  • Nepali stemming
  • Sentence similarity score
  • Spell correction
  • Translation (Nepali<->English)
  • Named Entity Recognition (Currently)