# Source code for transliteration.core

#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Any Indian Language to any other Indian language transliterator
# Copyright 2009-2010 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# If you find any bugs or have any suggestions
# email: santhosh.thottingal@gmail.com
# URL: http://www.smc.org.in

__all__ = ['Transliterator', 'getInstance']

import string
import normalizer
from cmudict import CMUDict
import indic_en
from silpa_common.langdetect import detect_lang
from silpa_common.charmap import charmap, charmap_transphon

# Base code point for every supported language code.  The transliteration
# routines compute a character's position inside its script as
# ``ord(character) - lang_bases[language]`` and move between two scripts by
# adding the difference of the two bases.  The English entries use 0.
lang_bases = {
    'en_US': 0,
    'en_IN': 0,
    'hi_IN': 0x0901,
    'bn_IN': 0x0981,
    'pa_IN': 0x0A01,
    'gu_IN': 0x0A81,
    'or_IN': 0x0B01,
    'ta_IN': 0x0B81,
    'te_IN': 0x0C01,
    'kn_IN': 0x0C81,
    'ml_IN': 0x0D01,
}


class Transliterator:
    """
    Transliteration class, instantiate this to get access to the
    transliteration methods.

    The Indic routines work on code point offsets: a character's offset
    inside its script is ``ord(character) - lang_bases[language]``, and a
    character is moved to another script by adding the difference between
    the two language bases.
    """

    # Offset of the virama from a language base (same for every script
    # handled here), and the first offset of the dependent vowel sign range
    # that ends at the virama.
    _VIRAMA_OFFSET = 76
    _FIRST_SIGN_OFFSET = 61

    # Source languages whose word-final inherent vowel is dropped when
    # producing ISO 15919 / IPA output, and targets for which the trailing
    # virama is dropped when English is chained through Malayalam.
    _FINAL_VOWEL_DROPPING = ("hi_IN", "gu_IN", "bn_IN")

    # Languages with schwa deletion as far as transliterate_indic_indic is
    # concerned.
    # TODO Add more languages having schwa deletion characteristic
    _SCHWA_DELETING = ("hi_IN", "gu_IN", "pa_IN", "bn_IN")

    # Code point range covered by the scripts listed in ``lang_bases``.
    # Anything outside of it is copied through untouched by
    # transliterate_indic_indic.
    _INDIC_FIRST = 0x0900
    _INDIC_LAST = 0x0D7F

    # Replacements applied to text whose target language is Tamil: code
    # points produced by the offset arithmetic are folded onto other Tamil
    # letters or sequences.
    _TAMIL_REPLACEMENTS = (
        (u'\u0B96', u"க"),
        (u'\u0B97', u"க"),
        (u'\u0B98', u"க"),
        (u'\u0B9B', u"ச"),
        (u'\u0B9D', u"ச"),
        (u'\u0BA0', u"ட"),
        (u'\u0BA1', u"ட"),
        (u'\u0BA2', u"ட"),
        (u'\u0BA5', u"த"),
        (u'\u0BA6', u"த"),
        (u'\u0BA7', u"த"),
        (u'\u0BAB', u"ப"),
        (u'\u0BAC', u"ப"),
        (u'\u0BAD', u"ப"),
        (u'\u0BC3', u"ிரு"),
        (u'ஂ', u'ம்'),
    )

    # Clean-ups applied to complete Malayalam output.
    _MALAYALAM_REPLACEMENTS = (
        (u"മ് ", u"ം "),
        (u"മ്,", u"ം,"),
        (u"മ്.", u"ം."),
        (u"മ്)", u"ം)"),
        (u"ഩ", u"ന"),
        (u"൤", u"."),  # danda by fullstop
    )

    def __init__(self):
        self.cmu = CMUDict()
        self.normalizer = normalizer.getInstance()

    @staticmethod
    def _as_text(value):
        """
        Return *value* as text.

        UTF-8 byte strings are decoded; a value that already is text is
        returned unchanged instead of being decoded a second time.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def transliterate_en_ml(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Malayalam with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "ml_IN")

    def transliterate_en_kn(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Kannada with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "kn_IN")

    def transliterate_en_hi(self, word):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :returns: the transliterated word.

        Transliterate English to Hindi with the help of
        CMU pronunciation dictionary
        """
        return self.cmu.pronunciation(word, "hi_IN")

    def transliterate_en_xx(self, word, target_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param target_lang: The language into which word has to be
                            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.

        Transliterate English to any Indian Language.  Kannada and Hindi
        use their own pronunciation lookup; every other language is reached
        by transliterating to Malayalam first and converting that result.
        """
        if target_lang in ("en_IN", "en_US"):
            return word
        if target_lang == "kn_IN":
            return self.transliterate_en_kn(word)
        if target_lang == "hi_IN":
            return self.transliterate_en_hi(word)

        tx_str = self.transliterate_en_ml(word)
        if target_lang == "ml_IN":
            return tx_str

        # Chain it through the indic-indic transliteration.
        # First remove the Malayalam specific zero width joiner.
        tx_str = tx_str.replace(u'\u200D', '')
        if tx_str[-1:] == u'്' and target_lang in self._FINAL_VOWEL_DROPPING:
            tx_str = tx_str[:-1]  # remove the last virama
        return self.transliterate_indic_indic(tx_str, "ml_IN", target_lang)

    def transliterate_xx_en(self, word, src_lang):
        """
        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :returns: the transliterated word.

        Transliterate Indian Language to English.
        """
        if src_lang in ("en_IN", "en_US"):
            return word

        # TODO: transliterate_indic_en is generic, but indic_en only has
        # data for kn_IN and ml_IN.  Once every Indic language is added to
        # indic_en, replace this block with a single call.
        if src_lang == "kn_IN":
            return self.transliterate_indic_en(word, src_lang)

        if src_lang != "ml_IN":
            word = self.transliterate_indic_indic(word, src_lang, "ml_IN")
        return self.transliterate_indic_en(word, "ml_IN")

    def transliterate_iso15919(self, word, src_language):
        """
        Transliterate the given word in src_language to ISO 15919 notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        :raises KeyError: if src_language is not a key of ``lang_bases``.
        """
        tx_str = ""
        for letter in word:
            offset = ord(letter) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if self._FIRST_SIGN_OFFSET <= offset <= self._VIRAMA_OFFSET:
                tx_str = tx_str[:-1]  # remove the last 'a'
            if 0 < offset <= 128:
                tx_str = tx_str + charmap["ISO15919"][offset]

        # Delete the inherent 'a' at the end of the word (hindi and alike).
        # Single letter words keep it.
        if (len(word) > 1 and src_language in self._FINAL_VOWEL_DROPPING
                and tx_str[-1:] == 'a'):
            tx_str = tx_str[:-1]
        return self._as_text(tx_str)

    def transliterate_ipa(self, word, src_language):
        """
        Transliterate the given word in src_language to IPA - International
        Phonetic Alphabet notation.

        :param word: The word to be transliterated.
        :type word: str.
        :param src_language: The language of the word.
        :type src_language: str.
        :returns: the transliterated word.
        :raises KeyError: if a character above code point 254 is met and
                          src_language is not a key of ``lang_bases``.
        """
        # Native string literal on purpose: its length then matches the
        # string type the result is accumulated in (two bytes in a Python 2
        # UTF-8 byte string).
        schwa = 'ə'
        tx_str = ""
        for letter in word:
            if ord(letter) < 255:
                # ASCII characters + English
                tx_str += letter
                continue
            offset = ord(letter) - lang_bases[src_language]
            # 76 is the virama offset for all indian languages from its base
            if self._FIRST_SIGN_OFFSET <= offset <= self._VIRAMA_OFFSET:
                tx_str = tx_str[:-len(schwa)]  # remove the last schwa
            if 0 < offset <= 128:
                tx_str = tx_str + charmap_transphon["IPA"][offset]

        # Delete the inherent vowel at the end of the word (hindi and
        # alike).  ``endswith`` is required here: comparing a one element
        # slice with the schwa never matches on byte strings, where the
        # schwa is two bytes long.
        if (len(word) > 1 and src_language in self._FINAL_VOWEL_DROPPING
                and tx_str.endswith(schwa)):
            tx_str = tx_str[:-len(schwa)]
        return self._as_text(tx_str)

    def _malayalam_fixes(self, text):
        """
        Apply Malayalam specific clean-ups to transliterated text.

        This is best effort: when *text* cannot be processed (it is not a
        string, or mixes encodings) whatever was achieved so far is
        returned instead of raising.
        """
        try:
            for old, new in self._MALAYALAM_REPLACEMENTS:
                text = text.replace(old, new)
        except (AttributeError, TypeError, UnicodeError):
            pass
        return text

    def transliterate_indic_indic(self, word, src_lang, target_lang):
        """
        Transliterate from an Indian language word to another Indian
        language word

        :param word: The word to be transliterated.
        :type word: str.
        :param src_lang: The language of the word.
        :type src_lang: str.
        :param target_lang: The language into which word has to be
                            transliterated.
        :type target_lang: str.
        :returns: the transliterated word.
        :raises KeyError: if target_lang is not a key of ``lang_bases`` and
                          the word contains a character to convert.
        """
        tx_str = ""
        word = self.normalizer.normalize(word)
        if src_lang == "ml_IN" and target_lang != "ml_IN":
            word = word.replace(u"\u200C", u"")
            word = word.replace(u"\u200D", u"")
            # drop every samvruthokaram (u vowel sign followed by virama)
            word = word.replace(u"ു്", u"")
        if not word:
            return tx_str

        shift = self.getOffset(src_lang, target_lang)
        last_index = len(word)
        index = 0
        for letter in word:
            index += 1
            code = ord(letter)
            # Punctuation and everything outside the Indic blocks is copied
            # as it is; shifting such characters would produce garbage.
            if (letter in string.punctuation
                    or code < self._INDIC_FIRST or code > self._INDIC_LAST):
                tx_str = tx_str + letter
                continue
            offset = code + shift
            if offset > 0:
                tx_str = tx_str + unichr(offset)
            # schwa deletion: a virama closing the word is dropped
            baseoffset = offset - lang_bases[target_lang]
            if (index == last_index and baseoffset == self._VIRAMA_OFFSET
                    and target_lang in self._SCHWA_DELETING):
                tx_str = tx_str[:-1]

        if target_lang == "ml_IN" and src_lang == "ta_IN":
            tx_str = tx_str.replace(u"ഩ", u"ന")

        if target_lang == "ta_IN":
            for old, new in self._TAMIL_REPLACEMENTS:
                tx_str = tx_str.replace(old, new)

        # If target is malayalam, we need to add the virama.  The slice
        # keeps an empty result from raising IndexError.
        if (target_lang == "ml_IN" and src_lang in self._SCHWA_DELETING
                and tx_str[-1:].isalpha()):
            tx_str = tx_str + u"്"
        return tx_str

    def transliterate_indic_en(self, word, src_lang):
        """
        Transliterate an Indian language word (or sentence) to English.

        :param word: Word to be transliterated (sentence)
        :type word: str.
        :param src_lang: Language from which we need to transliterate
        :type src_lang: str.
        :returns: the transliterated text.
        """
        # Get all the language related stuff
        dictionary = indic_en.get_dictionary_for(src_lang)
        vowels = indic_en.get_vowels_for(src_lang)
        vowel_signs = indic_en.get_vowel_signs_for(src_lang)
        virama = indic_en.get_virama_for(src_lang)
        anuswara = indic_en.get_anuswara_for(src_lang)

        word_length = len(word)
        tx_string = ""
        for index, letter in enumerate(word):
            # Punctuation is copied and otherwise skipped, to avoid getting
            # an extra 'a' at the beginning of the word next to it.
            if letter in string.punctuation:
                tx_string += letter
                continue

            # Virama = conjunct marker, it has no output of its own
            if letter == virama:
                continue

            # Get the english equivalent of the character; characters
            # missing from the dictionary are appended unchanged.
            try:
                tx_string += dictionary[letter]
            except KeyError:
                tx_string += letter

            is_sign = letter in vowel_signs
            if index + 1 < word_length:
                following = word[index + 1]
                # inherent 'a' between this letter and the next one
                if (following not in vowel_signs and following in dictionary
                        and letter not in vowels and not is_sign):
                    tx_string += 'a'
                # handle am sign
                if following == anuswara and not is_sign:
                    tx_string += 'a'
            elif not is_sign and letter in dictionary:
                # inherent 'a' after the last letter
                tx_string += 'a'
        return tx_string

    def transliterate(self, text, target_lang_code):
        """
        :param text: The text to be transliterated.
        :type text: str.
        :param target_lang_code: The language into which word has to be
                                 transliterated.
        :type target_lang_code: str.
        :returns: the transliterated text.

        The transliteration function which can transliterate text to the
        supported target languages.  The text is split on newlines and
        single spaces, and the source language is detected for every word
        on its own.
        """
        tx_str = ""
        lines = text.split("\n")
        for line in lines:
            for word in line.split(" "):
                if not word.strip():
                    tx_str = tx_str + word
                    continue

                try:
                    src_lang_code = detect_lang(word)[word]
                except Exception:
                    # FIXME: language detection failed, keep the word as is
                    tx_str = tx_str + " " + word
                    continue

                if target_lang_code == "ISO15919":
                    tx_str = (tx_str
                              + self.transliterate_iso15919(word,
                                                            src_lang_code)
                              + " ")
                    continue

                if target_lang_code == "IPA":
                    tx_str = (tx_str
                              + self.transliterate_ipa(word, src_lang_code)
                              + " ")
                    continue

                if src_lang_code == "en_US":
                    tx_str = (tx_str
                              + self.transliterate_en_xx(word,
                                                         target_lang_code)
                              + " ")
                    continue

                if target_lang_code in ("en_US", "en_IN"):
                    tx_str = (tx_str
                              + self.transliterate_xx_en(word, src_lang_code)
                              + " ")
                    continue

                tx_str += self.transliterate_indic_indic(word, src_lang_code,
                                                         target_lang_code)
                # NOTE(review): this tests the length of the line, not the
                # number of words in it - confirm that is intended.
                if len(line) > 1:
                    tx_str += " "

            if len(lines) > 1:
                tx_str += "\n"

        # Language specific fixes
        if target_lang_code == "ml_IN":
            tx_str = self._malayalam_fixes(tx_str)
        return tx_str

    def getOffset(self, src, target):
        """
        Return the number to add to a code point of language *src* to get
        the matching code point of language *target*.

        :returns: the difference between the two language bases, or 0 when
                  either language is not a key of ``lang_bases``.
        """
        try:
            return lang_bases[target] - lang_bases[src]
        except KeyError:
            return 0

    def get_module_name(self):
        """
        returns module name
        """
        return "Transliterator"

    def get_info(self):
        """
        Returns module info
        """
        return "Transliterate the text between any Indian Language"
def getInstance():
    """
    Create a new :class:`Transliterator` and return it.
    """
    instance = Transliterator()
    return instance