#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Any Indian Language to any other Indian language transliterator
# Copyright 2009-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions
# email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in
__all__ = ['Transliterator', 'getInstance']
import string
import normalizer
from cmudict import CMUDict
import indic_en
from silpa_common.langdetect import detect_lang
from silpa_common.charmap import charmap, charmap_transphon
# Base code point for every supported locale.  Character offsets used by the
# transliteration routines are measured from these bases (for example the
# virama sits at base + 76 in every Indic script listed here).  English has
# no Indic block and therefore uses 0.
lang_bases = {
    'en_US': 0,
    'en_IN': 0,
    'hi_IN': 0x0901,
    'bn_IN': 0x0981,
    'pa_IN': 0x0A01,
    'gu_IN': 0x0A81,
    'or_IN': 0x0B01,
    'ta_IN': 0x0B81,
    'te_IN': 0x0C01,
    'kn_IN': 0x0C81,
    'ml_IN': 0x0D01,
}
class Transliterator:
    """
    Transliterate text between Indian languages, English, ISO 15919 and IPA.

    Instantiate this class (or call ``getInstance()``) to get access to the
    transliteration methods.
    """

    # Offset of the virama from a language base (see ``lang_bases``).
    _VIRAMA_OFFSET = 76
    # Inclusive code point range spanned by the scripts in ``lang_bases``
    # (Devanagari block start up to the end of the Malayalam block).
    _INDIC_FIRST = 0x0900
    _INDIC_LAST = 0x0D7F
    # Tamil has a single letter per stop series; code points produced by the
    # block shift that Tamil does not use are folded onto the nearest letter.
    # Applied in this order.
    _TAMIL_FOLDS = (
        (u"\u0B96", u"\u0B95"),  # -> KA
        (u"\u0B97", u"\u0B95"),
        (u"\u0B98", u"\u0B95"),
        (u"\u0B9B", u"\u0B9A"),  # -> CA
        (u"\u0B9D", u"\u0B9A"),
        (u"\u0BA0", u"\u0B9F"),  # -> TTA
        (u"\u0BA1", u"\u0B9F"),
        (u"\u0BA2", u"\u0B9F"),
        (u"\u0BA5", u"\u0BA4"),  # -> TA
        (u"\u0BA6", u"\u0BA4"),
        (u"\u0BA7", u"\u0BA4"),
        (u"\u0BAB", u"\u0BAA"),  # -> PA
        (u"\u0BAC", u"\u0BAA"),
        (u"\u0BAD", u"\u0BAA"),
        (u"\u0BC3", u"\u0BBF\u0BB0\u0BC1"),  # vocalic R sign -> "iru"
        (u"\u0B82", u"\u0BAE\u0BCD"),        # anusvara -> MA + virama
    )

    def __init__(self):
        # Pronunciation dictionary used for English -> Indic transliteration.
        self.cmu = CMUDict()
        # Normalizer applied to every word before Indic -> Indic mapping.
        self.normalizer = normalizer.getInstance()

    @staticmethod
    def _to_unicode(text):
        """
        Decode ``text`` from UTF-8 when it is a byte string.

        Text that is already unicode is returned unchanged, so callers no
        longer fail when the accumulated result is not a byte string.
        """
        if isinstance(text, bytes):
            return text.decode("utf-8")
        return text

    def transliterate_en_ml(self, word):
        """
        Transliterate English to Malayalam with the help of the
        CMU pronunciation dictionary.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        Transliterate English to Kannada with the help of the
        CMU pronunciation dictionary.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_hi(self, word):
        """
        Transliterate English to Hindi with the help of the
        CMU pronunciation dictionary.

        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.
        """
        return self.cmu.pronunciation(word, "hi_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        Transliterate English to any Indian language.

        Kannada, Hindi and Malayalam are produced directly from the
        pronunciation dictionary; every other target is reached by
        transliterating to Malayalam first and chaining the result through
        :meth:`transliterate_indic_indic`.

        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain through the Indic -> Indic transliteration.  First drop the
        # Malayalam specific zero-width (non-)joiners.  They are written as
        # explicit escapes because an invisible literal is easily lost,
        # which turns the replace into a no-op.
        tx_str = tx_str.replace(u"\u200d", u"").replace(u"\u200c", u"")
        # Remove the trailing virama for the schwa-deleting targets that are
        # reached through this path (Hindi already returned above).
        if tx_str[-1:] == u"\u0d4d" and target_lang in ("gu_IN", "bn_IN"):
            tx_str = tx_str[:-1]
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        Transliterate an Indian language word to English.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.
        """
        if src_lang in ("en_IN", "en_US"):
            return word
        # TODO: transliterate_indic_en is generic, but indic_en only carries
        # data for kn_IN and ml_IN.  Once every Indic language is added to
        # indic_en this block can become a single call.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)
        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        # English has base 0 in lang_bases, so its ASCII code points would be
        # looked up in the Indic table below.  Return it unchanged instead,
        # as transliterate_en_xx / transliterate_xx_en do for English.
        if src_language in ("en_US", "en_IN"):
            return word

        tx_str = ""
        word_length = len(word)
        drops_final_a = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, letter in enumerate(word, 1):
            offset = ord(letter) - lang_bases[src_language]
            # Offsets 61..76 (76 being the virama) replace the inherent
            # vowel of the preceding letter: drop the last 'a'.
            if 61 <= offset <= self._VIRAMA_OFFSET:
                tx_str = tx_str[:-1]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]
            # Delete the inherent 'a' at the end of the word for the
            # schwa-deleting languages (only on the last letter).
            if (drops_final_a and tx_str[-1:] == 'a'
                    and index == word_length and word_length > 1):
                tx_str = tx_str[:-1]
        return self._to_unicode(tx_str)

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to
        IPA - International Phonetic Alphabet notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        """
        schwa = 'ə'
        # The literal may be longer than one element (it is a multi-byte
        # sequence when this is a byte string), so always slice by its length.
        schwa_len = len(schwa)
        tx_str = ""
        word_length = len(word)
        drops_final_schwa = src_language in ("hi_IN", "gu_IN", "bn_IN")
        for index, letter in enumerate(word, 1):
            if ord(letter) < 255:  # ASCII characters + English
                tx_str += letter
                continue
            offset = ord(letter) - lang_bases[src_language]
            # Offsets 61..76 (76 being the virama) replace the inherent
            # vowel of the preceding letter: drop the last schwa.
            if 61 <= offset <= self._VIRAMA_OFFSET:
                tx_str = tx_str[:-schwa_len]
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]
            # Delete the inherent schwa at the end of the word for the
            # schwa-deleting languages.  The tail is compared over the full
            # length of the literal; comparing a single element could never
            # match a multi-byte schwa.
            if (drops_final_schwa
                    and index == word_length and word_length > 1
                    and tx_str[-schwa_len:] == schwa):
                tx_str = tx_str[:-schwa_len]
        return self._to_unicode(tx_str)

    def _malayalam_fixes(self, text):
        """
        Apply Malayalam specific clean-ups to transliterated text.

        MA + virama before a space, comma, full stop or closing parenthesis
        becomes an anusvara, NNNA becomes NA and a danda becomes a full stop.

        :param text: The transliterated text.
        :returns: the cleaned text; the text as far as it was processed when
            a replacement cannot be applied.
        """
        fixes = (
            (u"\u0d2e\u0d4d ", u"\u0d02 "),
            (u"\u0d2e\u0d4d,", u"\u0d02,"),
            (u"\u0d2e\u0d4d.", u"\u0d02."),
            (u"\u0d2e\u0d4d)", u"\u0d02)"),
            (u"\u0d29", u"\u0d28"),
            # Danda by full stop.  U+0D64 is where the Devanagari danda
            # (U+0964) lands after the shift into the Malayalam block.  The
            # pattern must never be empty: replacing "" inserts a full stop
            # between every character.
            (u"\u0d64", u"."),
            (u"\u0964", u"."),
        )
        try:
            for old, new in fixes:
                text = text.replace(old, new)
        except (UnicodeError, TypeError):
            # Best effort: byte strings that cannot be combined with the
            # unicode patterns are returned as far as they were processed.
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word
        to another Indian language word.

        Each character is shifted by the distance between the two language
        bases.  Punctuation and characters outside the Indic range are
        copied unchanged.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be
            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        """
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # Replace every samvruthokaram (u sign + virama) by the u vowel
            # sign.  An empty replacement would delete the vowel altogether.
            word = word.replace(u"\u0d41\u0d4d", u"\u0d41")

        schwa_deleting = target_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
        word_length = len(word)
        for index, letter in enumerate(word, 1):
            # Only code points inside the Indic blocks can be shifted; digits,
            # Latin letters and the like would otherwise be mapped onto
            # unrelated characters.
            if (letter in string.punctuation
                    or not (self._INDIC_FIRST <= ord(letter)
                            <= self._INDIC_LAST)):
                tx_str = tx_str + letter
                continue
            target_point = ord(letter) + self.getOffset(src_lang, target_lang)
            if target_point > 0:
                tx_str = tx_str + unichr(target_point)
            # Schwa deletion: a virama at the very end of the word is dropped
            # for languages that do not pronounce the final inherent vowel.
            # TODO Add more languages having schwa deletion characteristic
            baseoffset = target_point - lang_bases[target_lang]
            if (index == word_length and baseoffset == self._VIRAMA_OFFSET
                    and schwa_deleting):
                tx_str = tx_str[:-1]

        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"\u0d29", u"\u0d28")  # NNNA -> NA
        if target_lang == "ta_IN":
            for old, new in self._TAMIL_FOLDS:
                tx_str = tx_str.replace(old, new)
        # If the target is Malayalam, a word coming from a schwa-deleting
        # language needs an explicit virama after a final letter.  Slicing
        # keeps an empty result from raising IndexError.
        if (target_lang == "ml_IN"
                and src_lang in ("hi_IN", "gu_IN", "pa_IN", "bn_IN")
                and tx_str[-1:].isalpha()):
            tx_str = tx_str + u"\u0d4d"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate an Indian language word (or sentence) to English.

        :param word: Word to be transliterated (sentence).
        :param src_lang: Language from which we need to transliterate.
        :returns: the transliterated text.
        """
        # Get all the language related data.
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        word_length = len(word)
        tx_string = ""
        for index, letter in enumerate(word):
            # Punctuation is copied as is; this also avoids an extra 'a' at
            # the beginning of the word following a punctuation symbol.
            if letter in string.punctuation:
                tx_string += letter
                continue
            # Virama = conjoiner, it has no English equivalent.
            if letter == virama:
                continue
            # English equivalent of the character; characters missing from
            # the dictionary are appended unchanged.
            try:
                tx_string += dictionary[letter]
            except KeyError:
                tx_string += letter

            has_next = index + 1 < word_length
            if has_next:
                following = word[index + 1]
                # Inherent 'a' between this letter and the next one.
                if (following not in vowel_signs
                        and following in dictionary
                        and letter not in vowels
                        and letter not in vowel_signs):
                    tx_string += 'a'
            # Inherent 'a' at the end of the word.
            # NOTE(review): unlike the mid-word rule this does not exclude
            # independent vowels - confirm whether that is intended.
            if (index + 1 == word_length
                    and letter not in vowel_signs
                    and letter in dictionary):
                tx_string += 'a'
            # Handle the "am" sign (anuswara).
            if (has_next and word[index + 1] == anuswara
                    and letter not in vowel_signs):
                tx_string += 'a'
        return tx_string

    def transliterate(self, text, target_lang_code):
        """
        Transliterate text to one of the supported targets.

        The text is processed word by word (split on newlines and single
        spaces).  The source language of every word is detected separately.
        The target may be a language code, ``"ISO15919"`` or ``"IPA"``.

        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which the text has to be
            transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    tx_str = tx_str + word
                    continue
                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # The language could not be detected: keep the word as
                    # it is.  The separator follows the word, like in every
                    # other branch, so it does not merge with the next word.
                    tx_str = tx_str + word + " "
                    continue
                if target_lang_code == "ISO15919":
                    tx_str = (tx_str
                              + self.transliterate_iso15919(word,
                                                            src_lang_code)
                              + " ")
                elif target_lang_code == "IPA":
                    tx_str = (tx_str
                              + self.transliterate_ipa(word, src_lang_code)
                              + " ")
                elif src_lang_code == "en_US":
                    tx_str = (tx_str
                              + self.transliterate_en_xx(word,
                                                         target_lang_code)
                              + " ")
                elif target_lang_code in ("en_US", "en_IN"):
                    tx_str = (tx_str
                              + self.transliterate_xx_en(word, src_lang_code)
                              + " ")
                else:
                    tx_str += self.transliterate_indic_indic(
                        word, src_lang_code, target_lang_code)
                    if len(line) > 1:
                        tx_str += " "
            if len(lines) > 1:
                tx_str += "\n"
        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the distance between the bases of two languages.

        :param src: The source language code.
        :param target: The target language code.
        :returns: ``lang_bases[target] - lang_bases[src]``, or 0 when either
            code is not a usable key of ``lang_bases``.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except (KeyError, TypeError):
            return 0

    def get_module_name(self):
        """
        Return the module name.
        """
        return "Transliterator"

    def get_info(self):
        """
        Return the module info.
        """
        return "Transliterate the text between any Indian Language"
def getInstance():
    """
    Return a new instance of :class:`Transliterator`.
    """
    return Transliterator()