#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Sanskrit Text Utility"""
###############################################################################
__author__ = """Hrishikesh Terdalkar"""
__email__ = "hrishikeshrt@linuxmail.org"
__version__ = "1.0.0"
__created_at__ = "Tue Apr 17 22:20:39 2018"
###############################################################################
import logging
import re
from collections import defaultdict
from itertools import product
from typing import Dict, List, Optional, Tuple, Union
###############################################################################
LOGGER = logging.getLogger(__name__)
###############################################################################
def ord_unicode(ch: str) -> str:
    """Get Unicode 4-character-identifier corresponding to a character

    Parameters
    ----------
    ch : str
        Single character

    Returns
    -------
    str
        Lowercase hexadecimal code point, zero-padded to at least
        4 characters (code points above U+FFFF yield more digits).

    Raises
    ------
    TypeError
        If `ch` is not a string of length 1 (raised by `ord`).
    """
    return format(ord(ch), "04x")
def chr_unicode(u: str) -> str:
    """Get a Unicode character corresponding to 4-character identifier

    Parameters
    ----------
    u : str
        Hexadecimal unicode identifier (e.g. "0950")

    Returns
    -------
    str
        Single character

    Raises
    ------
    ValueError
        If `u` is not valid hexadecimal or is outside the Unicode range
        (raised by `int` / `chr`).
    """
    code_point = int(u, 16)
    return chr(code_point)
###############################################################################
# Free-form design notes about the character tables defined below, kept as a
# module-level string. No code visible in this file reads this constant.
DESIGN = """
* len(SWARA) == len(MATRA) + 1 # 'अ' is extra at the beginning
* len(EXTENDED_SWARA) == len(EXTENDED_MATRA) + 1 # 'ऍ' is extra at the end
* It is unclear which of 'ॲ' or 'ऍ' should correspond to 'ॅ', current choice is
the former. If that changes, the order in EXTENDED_MATRA would need to change
* ARTIFICIAL_MATRA contains absent vowel signs. Currently this is just for 'अ'.
Any new sign should follow the pattern as hyphen ('-') followed by the vowel
letter (e.g. '-अ').
* DIGITS, COMBINING_DIGIT_MARKS, PUNCTUATION and GENERAL_PUNCTUATION aren't
part of the ALPHABET. Their inclusion needs more deliberation.
* VEDIC_MARKS are not used in syllabification functions currently.
"""
###############################################################################
# Alphabet of Sanskrit
# Independent vowel letters. MATRA[i] is the vowel sign of SWARA[i + 1];
# the first vowel has no sign of its own (see marker_to_swara/swara_to_marker).
SWARA = ["अ", "आ", "इ", "ई", "उ", "ऊ", "ऋ", "ॠ", "ऌ", "ॡ", "ए", "ऐ", "ओ", "औ"]
# Additional vowel letters. EXTENDED_MATRA[i] pairs with EXTENDED_SWARA[i];
# the last entry has no corresponding sign.
EXTENDED_SWARA = ["ऎ", "ऒ", "ॲ", "ऑ", "ऍ"]
# Dependent vowel signs, index-aligned with SWARA[1:].
MATRA = ["ा", "ि", "ी", "ु", "ू", "ृ", "ॄ", "ॢ", "ॣ", "े", "ै", "ो", "ौ"]
# Dependent vowel signs, index-aligned with EXTENDED_SWARA[:-1].
EXTENDED_MATRA = ["ॆ", "ॊ", "ॅ", "ॉ"]
# Consonants in groups of five; the five lists below are concatenated in this
# order to form VARGIYA, and the index arithmetic elsewhere relies on each
# group having exactly five members.
KANTHYA = ["क", "ख", "ग", "घ", "ङ"]
TALAVYA = ["च", "छ", "ज", "झ", "ञ"]
MURDHANYA = ["ट", "ठ", "ड", "ढ", "ण"]
DANTYA = ["त", "थ", "द", "ध", "न"]
AUSHTHYA = ["प", "फ", "ब", "भ", "म"]
# Remaining consonant groups (appended after VARGIYA to form VYANJANA).
ANTAHSTHA = ["य", "र", "ल", "व"]
USHMA = ["श", "ष", "स", "ह"]
VISHISHTA = ["ळ"]
# Extra consonant letters; part of ALL_VYANJANA but not of VYANJANA.
EXTENDED_VYANJANA = ["ऩ", "ऱ", "ऴ", "क़", "ख़", "ग़", "ज़", "ड़", "ढ़", "फ़", "य़"]
# --------------------------------------------------------------------------- #
# Individual signs and marks, referenced by name in the groups and functions
# below.
# Placeholder "vowel sign" for the first vowel: a hyphen followed by SWARA[0].
ARTIFICIAL_MATRA_A = f"-{SWARA[0]}"
OM = "ॐ"
AVAGRAHA = "ऽ"
SWARITA = "॑"
DOUBLE_SWARITA = "᳚"
TRIPLE_SWARITA = "᳛"
ANUDATTA = "॒"
CHANDRABINDU = "ँ"
CHANDRABINDU_VIRAMA = "ꣳ"
CHANDRABINDU_SPACING = "ꣲ"
# NOTE(review): this name is spelled without the "R" used by the sibling
# CHANDRABINDU_* constants; left unchanged because SPECIAL refers to it.
CHANDABINDU_TWO = "ꣵ"
CHANDRABINDU_THREE = "ꣶ"
ANUSWARA = "ं"
VISARGA = "ः"
ARDHAVISARGA = "ᳲ"
JIHVAAMULIYA = "ᳵ"
UPADHMANIYA = "ᳶ"
HALANTA = "्"
NUKTA = "़" # unused
ABBREV = "॰"
DANDA = "।"
DOUBLE_DANDA = "॥"
# --------------------------------------------------------------------------- #
# Groups
# The 25 grouped consonants, five per group, in group order.
VARGIYA = KANTHYA + TALAVYA + MURDHANYA + DANTYA + AUSHTHYA
VYANJANA = VARGIYA + ANTAHSTHA + USHMA + VISHISHTA
# Column-wise slices of VARGIYA: the n-th member of each of the five groups.
VARGA_PRATHAMA = [VARGIYA[i * 5] for i in range(5)]
VARGA_DWITIYA = [VARGIYA[i * 5 + 1] for i in range(5)]
VARGA_TRITIYA = [VARGIYA[i * 5 + 2] for i in range(5)]
VARGA_CHATURTHA = [VARGIYA[i * 5 + 3] for i in range(5)]
VARGA_PANCHAMA = [VARGIYA[i * 5 + 4] for i in range(5)]
# Vowels / vowel signs accepted by is_laghu(). The MATRA indices are offset by
# one relative to the SWARA indices because MATRA has no entry for SWARA[0].
LAGHU_SWARA = [SWARA[i] for i in [0, 2, 4, 6, 8]] + EXTENDED_SWARA[:2]
LAGHU_MATRA = [MATRA[i] for i in [1, 3, 5, 7]] + EXTENDED_MATRA[:2]
# --------------------------------------------------------------------------- #
# AYOGAVAAHA_COMMON is the subset the splitting/joining functions handle.
AYOGAVAAHA_COMMON = [CHANDRABINDU, ANUSWARA, VISARGA]
AYOGAVAAHA = AYOGAVAAHA_COMMON + [JIHVAAMULIYA, UPADHMANIYA]
VEDIC_MARKS = [SWARITA, ANUDATTA, DOUBLE_SWARITA, TRIPLE_SWARITA]
# Characters that start a new unit in get_syllables_word(), like VARNA does.
SPECIAL = [
AVAGRAHA,
OM,
CHANDRABINDU_VIRAMA,
CHANDRABINDU_SPACING,
CHANDABINDU_TWO,
CHANDRABINDU_THREE,
]
OTHER = [HALANTA]
# --------------------------------------------------------------------------- #
ARTIFICIAL_MATRA = [ARTIFICIAL_MATRA_A]
ALL_SWARA = SWARA + EXTENDED_SWARA
ALL_VYANJANA = VYANJANA + EXTENDED_VYANJANA
ALL_MATRA = MATRA + EXTENDED_MATRA # does NOT contain ARTIFICIAL_MATRA
VARNA = ALL_SWARA + ALL_VYANJANA
# Every character clean() keeps by default (before spaces/punct/digits/allow).
ALPHABET = VARNA + ALL_MATRA + AYOGAVAAHA + SPECIAL + OTHER + VEDIC_MARKS
# --------------------------------------------------------------------------- #
# Optional character classes; clean() adds them to the allowed set on request.
SPACES = [" ", "\t", "\n", "\r"]
PUNCTUATION = [DANDA, DOUBLE_DANDA, ABBREV]
# NOTE(review): the empty-string entry can never equal a single character, so
# it has no effect in clean(); possibly a character was lost here -- confirm.
GENERAL_PUNCTUATION = [".", ",", ";", "", '"', "'", "`"]
DIGITS = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"]
# Not referenced by any function visible in this file.
COMBINING_DIGIT_MARKS = ["꣠", "꣡", "꣢", "꣣", "꣤", "꣥", "꣦", "꣧", "꣨", "꣩"]
# --------------------------------------------------------------------------- #
# Special Sequences
# Multi-character conjunct sequences; not referenced by any function visible
# in this file.
KSHA = "क्ष"
JNA = "ज्ञ"
###############################################################################
# User-facing help text describing how to type the less common characters used
# in this module. Only the typo "chan be typed" was corrected to "can be
# typed"; the remaining text is unchanged.
HOW_TO_WRITE = """
Unicode characters can be typed directly from the keyboard as follows,
[Ctrl+Shift+u] [4-digit-unicode-identifier] [space]
Some of the characters can also be typed using m17n-sanskrit-itrans keyboard
(Package: https://launchpad.net/ubuntu/+source/ibus-m17n)
(File: /usr/share/m17n/sa-itrans.mim)
Notable Unicodes and Shortcuts
---
1cf2 for Ardhavisarga
1cf5 for Jihvamuliya -- kH
1cf6 for Upadhmaniya -- pH
0951 for Swarita -- ''
0952 for Anudatta -- _
0901 for Chandrabindu -- .N
a8f2 for (stand-alone) Chandrabindu Spacing
093d for Avagraha -- .a
094d for Halanta -- .h
0950 for Om -- OM
a8e0 to a8e9 for Combining Devanagari Digits 0-9 (Swara Marks for Samaveda)
"""
###############################################################################
# One inner list per sutra. The final entry of every row carries a HALANTA;
# entries containing HALANTA are treated as markers rather than members by the
# MAAHESHWARA_IDX loop and by resolve_pratyaahaara().
# Some letters and markers occur in more than one row, which is why
# resolve_pratyaahaara() can return several resolutions.
MAAHESHWARA_SUTRA = [
["अ", "इ", "उ", "ण्"],
["ऋ", "ऌ", "क्"],
["ए", "ओ", "ङ्"],
["ऐ", "औ", "च्"],
["ह", "य", "व", "र", "ट्"],
["ल", "ण्"],
["ञ", "म", "ङ", "ण", "न", "म्"],
["झ", "भ", "ञ्"],
["घ", "ढ", "ध", "ष्"],
["ज", "ब", "ग", "ड", "द", "श्"],
["ख", "फ", "छ", "ठ", "थ", "च", "ट", "त", "व्"],
["क", "प", "य्"],
["श", "ष", "स", "र्"],
["ह", "ल्"],
]
# --------------------------------------------------------------------------- #
# All rows flattened into a single sequence, markers included.
MAAHESHWARA_KRAMA = [varna for sutra in MAAHESHWARA_SUTRA for varna in sutra]
# --------------------------------------------------------------------------- #
# Maps each entry of MAAHESHWARA_SUTRA to a list of
# (sutra index, position within the sutra, running index) tuples, one tuple
# per occurrence. Entries containing HALANTA (markers) get a running index of
# -1 and do not advance the counter.
# The loop indentation was missing in the source and has been restored; the
# module-level names used by the loop are kept as they were.
MAAHESHWARA_IDX = defaultdict(list)
idx = 0
for _sutra_idx, sutra in enumerate(MAAHESHWARA_SUTRA):
    for _internal_idx, varna in enumerate(sutra):
        if HALANTA in varna:
            _idx = -1
        else:
            _idx = idx
            idx += 1
        MAAHESHWARA_IDX[varna].append((_sutra_idx, _internal_idx, _idx))
###############################################################################
def resolve_pratyaahaara(pratyaahaara: str) -> List[List[str]]:
    """Resolve pratyaahaara into all possible lists of characters

    Parameters
    ----------
    pratyaahaara : str
        First character is the starting letter, the remainder is the
        closing marker (as it appears in `MAAHESHWARA_KRAMA`).

    Returns
    -------
    List[List[str]]
        One list per (start, end) pair with start before end, in order of
        start position and then end position. Each list holds the entries
        of `MAAHESHWARA_KRAMA` from the start up to (not including) the
        end marker, skipping entries that contain a HALANTA.
        Empty if the input is empty or no valid pair exists.
    """
    if not pratyaahaara:
        # Previously raised IndexError on pratyaahaara[0]
        return []

    aadi, antya = pratyaahaara[0], pratyaahaara[1:]
    possible_starts = [
        position
        for position, varna in enumerate(MAAHESHWARA_KRAMA)
        if varna == aadi
    ]
    possible_ends = [
        position
        for position, varna in enumerate(MAAHESHWARA_KRAMA)
        if varna == antya
    ]
    return [
        [
            varna
            for varna in MAAHESHWARA_KRAMA[start:end]
            if HALANTA not in varna
        ]
        for start in possible_starts
        for end in possible_ends
        if start < end
    ]
###############################################################################
def clean(
    text: str,
    punct: bool = False,
    digits: bool = False,
    spaces: bool = True,
    allow: Optional[List[str]] = None,
) -> str:
    """Clean a line of Sanskrit (Devanagari) text

    Parameters
    ----------
    text : str
        Input string
    punct : bool, optional
        If True, the punctuations are kept.
        The default is False.
    digits : bool, optional
        If True, digits are kept.
        The default is False.
    spaces : bool, optional
        If False, spaces are removed.
        It is recommended to not change the default value
        unless it is specifically relevant to a use-case.
        The default is True.
    allow : list, optional
        List of characters to allow (any iterable of characters works).
        The default is None.

    Returns
    -------
    str
        Clean version of the string: disallowed characters are dropped,
        runs of whitespace within a line are collapsed to a single space
        and blank lines are removed.
    """
    # A set gives constant-time membership tests per character
    allowed = set(ALPHABET)
    if allow:
        allowed.update(allow)
    if spaces:
        allowed.update(SPACES)
    if punct:
        allowed.update(PUNCTUATION)
        allowed.update(GENERAL_PUNCTUATION)
    if digits:
        allowed.update(DIGITS)

    kept = "".join(c for c in text if c in allowed)
    return "\n".join(
        " ".join(line.split()) for line in kept.split("\n") if line.strip()
    )
def split_lines(text: str, pattern=r"[।॥\r\n]+") -> List[str]:
    """Split a string into a list of strings using regular expression

    Parameters
    ----------
    text : str
        Input string
    pattern : regexp, optional
        Regular expression corresponding to the split points.
        The default is r'[।॥\\r\\n]+'.

    Returns
    -------
    List[str]
        List of non-empty strings between the split points
    """
    # Empty pieces (leading/trailing/adjacent separators) are discarded
    return [piece for piece in re.split(pattern, text) if piece]
###############################################################################
def trim_matra(line: str) -> str:
    """
    Trim trailing mātrā and related markers from the end of a string.

    This is a simple orthographic heuristic intended for rough
    normalisation (for example, comparing or grouping word-final
    consonant bases). It is **not** a linguistically realistic notion
    of stemming or lemmatisation and should not be used as such.

    The function removes, in order:

    1. A final anusvāra/halanta/visarga, if present.
    2. A final mātrā character, if present after step (1).

    If the input string is empty, it is returned unchanged.

    Parameters
    ----------
    line : str
        Input string

    Returns
    -------
    str
        The string with at most two trailing characters removed
    """
    if not line:
        return line

    answer = line
    if answer[-1] in (ANUSWARA, HALANTA, VISARGA):
        answer = answer[:-1]
    # Step (1) may have emptied a one-character string
    if answer and answer[-1] in ALL_MATRA:
        answer = answer[:-1]
    return answer
###############################################################################
def is_laghu(syllable: str) -> bool:
    """Checks if the current syllable is Laghu

    The check is purely per-character: the syllable counts as Laghu when
    every character is a consonant, a HALANTA, or one of the vowels /
    vowel signs listed in `LAGHU_SWARA` / `LAGHU_MATRA`. Neighbouring
    syllables are not considered.

    Parameters
    ----------
    syllable : str
        Syllable to check

    Returns
    -------
    bool
        True if all characters pass the check (also True for an empty
        string), False otherwise.
    """
    return all(
        x in ALL_VYANJANA
        or x in LAGHU_SWARA
        or x in LAGHU_MATRA
        or x == HALANTA
        for x in syllable
    )
def toggle_matra(syllable: str) -> Optional[str]:
    """Change the Laghu syllable to Guru and Guru to Laghu (if possible)

    Parameters
    ----------
    syllable : str
        Either a syllable ending in a vowel sign from `MATRA`,
        or a single vowel letter from `SWARA`.

    Returns
    -------
    str or None
        The syllable with its final vowel sign (or the vowel letter itself)
        swapped with its neighbour in `MATRA` (or `SWARA`).
        None when no swap is defined: empty input, a syllable without a
        final vowel sign, the first sign of `MATRA`, or any vowel / vowel
        sign beyond the index ranges handled below.
    """
    if not syllable:
        # Previously raised IndexError on syllable[-1]
        return None

    if syllable[-1] in MATRA:
        index = MATRA.index(syllable[-1])
        # MATRA[1:9] is laid out in pairs; odd index <-> following even index
        if index in (2, 4, 6, 8):
            return syllable[:-1] + MATRA[index - 1]
        if index in (1, 3, 5, 7):
            return syllable[:-1] + MATRA[index + 1]

    if syllable in SWARA:
        index = SWARA.index(syllable)
        # SWARA[0:10] is laid out in pairs; even index <-> following odd index
        if index in (0, 2, 4, 6, 8):
            return SWARA[index + 1]
        if index in (1, 3, 5, 7, 9):
            return SWARA[index - 1]

    return None
###############################################################################
def marker_to_swara(m: str) -> Optional[str]:
    """Convert a Matra to corresponding Swara

    Parameters
    ----------
    m : str
        Vowel sign from `MATRA` / `EXTENDED_MATRA`, or an artificial
        marker from `ARTIFICIAL_MATRA` (hyphen followed by a vowel letter).

    Returns
    -------
    str or None
        Corresponding vowel letter, or None if `m` is not a known marker.
    """
    if m in ARTIFICIAL_MATRA:
        # Artificial markers are "-" + vowel letter; drop the hyphen
        return m[1:]
    if m in MATRA:
        # SWARA has one extra leading entry, hence the offset
        return SWARA[MATRA.index(m) + 1]
    if m in EXTENDED_MATRA:
        return EXTENDED_SWARA[EXTENDED_MATRA.index(m)]
    return None
def swara_to_marker(s: str) -> Optional[str]:
    """Convert a Swara to corresponding Matra

    Parameters
    ----------
    s : str
        Vowel letter from `SWARA` or `EXTENDED_SWARA`

    Returns
    -------
    str or None
        Corresponding vowel sign. For the first vowel of `SWARA` this is
        the artificial marker (hyphen followed by the vowel).
        None if `s` is not a known vowel or is the last entry of
        `EXTENDED_SWARA`, which has no sign in `EXTENDED_MATRA`.
    """
    if s == SWARA[0]:
        return ARTIFICIAL_MATRA_A
    if s in SWARA:
        # MATRA lacks an entry for SWARA[0], hence the offset
        return MATRA[SWARA.index(s) - 1]
    if s in EXTENDED_SWARA[:-1]:
        return EXTENDED_MATRA[EXTENDED_SWARA.index(s)]
    return None
###############################################################################
def get_anunaasika(ch: str) -> str:
    """Get the appropriate anunaasika from the character's group

    Parameters
    ----------
    ch : str
        Single character (or an empty string)

    Returns
    -------
    str
        - the last member of `AUSHTHYA` if `ch` is empty,
        - the fifth member of the group of five that `ch` belongs to
          if `ch` is in `VARGIYA`,
        - `ANUSWARA` otherwise.
    """
    if ch == "":
        return AUSHTHYA[4]
    if ch in VARGIYA:
        # VARGIYA is laid out in groups of five; pick the group's last member
        group_start = (VARGIYA.index(ch) // 5) * 5
        return VARGIYA[group_start + 4]
    return ANUSWARA
def fix_anuswara(text: str) -> str:
    """
    Check every anuswaara in the text and change to anunaasika if applicable

    An anuswaara immediately followed by a character from `VARGIYA` is
    replaced by the result of `get_anunaasika` for that character plus a
    HALANTA. Every other character, including a text-final anuswaara,
    is kept as is.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    str
        Text with the applicable anuswaara replaced
    """
    if not text:
        return ""

    output_chars = []
    # Walk over (character, following character) pairs
    for char, next_char in zip(text, text[1:]):
        if char == ANUSWARA and next_char in VARGIYA:
            output_chars.append(get_anunaasika(next_char))
            output_chars.append(HALANTA)
        else:
            output_chars.append(char)
    # The last character has no successor and is never replaced
    output_chars.append(text[-1])
    return "".join(output_chars)
###############################################################################
def get_syllables_word(word: str, technical: bool = False) -> List[str]:
    """Get syllables from a Sanskrit (Devanagari) word

    Parameters
    ----------
    word : str
        Sanskrit (Devanagari) word to get syllables from.
        Spaces, if present, are ignored.
    technical : bool, optional
        If True, ensures that each element contains at most
        one Swara or Vyanjana.
        The default is False.

    Returns
    -------
    List[str]
        List of syllables
    """
    word = clean(word, spaces=False)
    wlen = len(word)

    # Characters at which a new unit starts; built once, not per iteration
    start_chars = set(VARNA + SPECIAL)
    if technical:
        start_chars.update(AYOGAVAAHA_COMMON)

    word_syllables = []
    current = ""
    i = 0
    while i < wlen:
        current += word[i]
        i += 1

        # Attach everything up to the next start character
        while i < wlen and word[i] not in start_chars:
            current += word[i]
            i += 1

        # Unless `technical`, a unit ending in HALANTA is carried over and
        # merged with the following unit (except at the end of the word)
        if current[-1] != HALANTA or i == wlen or technical:
            word_syllables.append(current)
            current = ""
    return word_syllables
def get_syllables(text: str, technical: bool = False) -> List[List[List[str]]]:
    """Get syllables from a Sanskrit (Devanagari) text

    Parameters
    ----------
    text : str
        Sanskrit (Devanagari) text to get syllables from
    technical : bool, optional
        If True, ensures that each element contains at most
        one Swara or Vyanjana.
        The default is False.

    Returns
    -------
    List[List[List[str]]]
        List of syllables in a nested list format
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [get_syllables_word(word, technical) for word in line.split()]
        for line in split_lines(text.strip())
    ]
###############################################################################
def split_varna_word(word: str, technical: bool = True) -> List[str]:
    """Obtain the Varna decomposition of a Sanskrit (Devanagari) word

    Parameters
    ----------
    word : str
        Sanskrit (Devanagari) word to be split.
    technical : bool, optional
        If True, vowels and vowel signs are treated independently
        which is more useful for analysis.
        If False, vowel signs are converted to vowel letters and the
        marks from `AYOGAVAAHA_COMMON` are attached to the preceding
        element.
        The default is True.

    Returns
    -------
    List[str]
        List of Varna
    """
    word_viccheda = []
    for syllable in get_syllables_word(word, True):
        first = syllable[0]
        if first in ALL_SWARA:
            word_viccheda.append(first)
            if len(syllable) > 1:
                word_viccheda.append(syllable[1])
            # TODO: Will this ever be the case?
            if len(syllable) > 2:
                LOGGER.warning(f"Long SWARA: {syllable}")
                word_viccheda.append(syllable[2:])
        elif first in ALL_VYANJANA:
            word_viccheda.append(first + HALANTA)
            if len(syllable) == 1:
                # Bare consonant carries the inherent vowel
                word_viccheda.append(ARTIFICIAL_MATRA_A)
            if len(syllable) > 1:
                if syllable[1] in AYOGAVAAHA_COMMON:
                    # Mark directly on a consonant: inherent vowel first
                    word_viccheda.append(ARTIFICIAL_MATRA_A)
                if syllable[1] != HALANTA:
                    word_viccheda.append(syllable[1])
            # TODO: Will this ever be the case?
            if len(syllable) > 2:
                LOGGER.warning(f"Long VYANJANA: {syllable}")
                word_viccheda.append(syllable[2:])
        else:
            word_viccheda.append(syllable)

    if technical:
        return word_viccheda

    vowel_markers = ARTIFICIAL_MATRA + ALL_MATRA
    real_word_viccheda = []
    for varna in word_viccheda:
        if varna in vowel_markers:
            real_word_viccheda.append(marker_to_swara(varna))
        elif varna in AYOGAVAAHA_COMMON:
            if real_word_viccheda:
                real_word_viccheda[-1] += varna
            else:
                # Word starts with the mark: nothing to attach it to
                # (previously raised IndexError)
                real_word_viccheda.append(varna)
        else:
            real_word_viccheda.append(varna)
    return real_word_viccheda
def split_varna(
    text: str, technical: bool = True, flat: bool = False
) -> Union[List[List[List[str]]], List[str]]:
    """Obtain the Varna decomposition of a Sanskrit (Devanagari) text

    Parameters
    ----------
    text : str
        Sanskrit (Devanagari) text to be split.
    technical : bool, optional
        Passed on to `split_varna_word`.
        If True, vowels and vowel signs are treated independently
        which is more useful for analysis.
        The default is True.
    flat : bool, optional
        If True, a single list is returned instead of nested lists.
        The default is False.

    Returns
    -------
    List[List[List[str]]] or List[str]
        Varna decomposition of the text in a nested list format.
        Nesting Levels: Text -> Lines -> Words

        - Varna decomposition of each word is a List[str].
        - List of Varna decomposition of each word from a line.
        - List of Varna decomposition of each line from the text.

        If `flat=True`, Varna decomposition of the entire text is presented
        as a single list, also containing whitespace markers.
        Lines are separated by a newline character '\\n' and words are
        separated by a space character ' '.
    """
    lines = split_lines(text.strip())

    if not flat:
        return [
            [split_varna_word(word, technical) for word in line.split()]
            for line in lines
        ]

    viccheda = []
    for line_idx, line in enumerate(lines):
        if line_idx:
            # Separator between consecutive lines
            viccheda.append("\n")
        for word_idx, word in enumerate(line.split()):
            if word_idx:
                # Separator between consecutive words of a line
                viccheda.append(" ")
            viccheda.extend(split_varna_word(word, technical))
    return viccheda
def join_varna(viccheda: List[str], technical: bool = True) -> str:
    """
    Join Varna decomposition to form a Sanskrit (Devanagari) word

    Parameters
    ----------
    viccheda : list
        Viccheda output obtained by `split_varna_word` with `technical=True`
        (or output of `split_varna` with `technical=True` and `flat=True`)
        IMPORTANT: `technical=True` is necessary.
    technical : bool
        WARNING: Currently unused.
        Value of the same parameter passed to `split_varna_word`

    NOTE
    ----
    Currently only works for the viccheda generated with `technical=True`

    Returns
    -------
    s : str
        Sanskrit word
    """
    whitespace = (" ", "\n")
    # Concatenations hoisted out of the loop
    swara_or_special = ALL_SWARA + SPECIAL
    vowel_markers = ARTIFICIAL_MATRA + ALL_MATRA
    standalone_markers = vowel_markers + AYOGAVAAHA_COMMON

    word = []
    i = 0
    while i < len(viccheda):
        curr_varna = viccheda[i]
        next_varna = viccheda[i + 1] if i < len(viccheda) - 1 else ""
        i += 1

        if curr_varna in whitespace:
            word.append(curr_varna)
            continue

        if curr_varna[0] in swara_or_special:
            word.append(curr_varna[0])
            if curr_varna[-1] in AYOGAVAAHA_COMMON:
                word.append(curr_varna[-1])

        if curr_varna[-1] == HALANTA:
            # Consonant: what follows decides whether the HALANTA stays
            if next_varna in whitespace:
                word.append(curr_varna)
                continue
            if next_varna == "":
                word.append(curr_varna)
                break
            if next_varna[-1] == HALANTA:
                # Conjunct: keep the explicit HALANTA
                word.append(curr_varna)
            if next_varna[0] in ALL_SWARA:
                i += 1
                # The consonant takes the vowel *sign* of the following
                # vowel letter. The previous call to marker_to_swara()
                # returned None for a vowel letter, which made the final
                # "".join() raise TypeError.
                marker = swara_to_marker(next_varna[0])
                if marker is None:
                    # No vowel sign exists for this vowel letter: keep the
                    # explicit HALANTA followed by the vowel letter itself
                    word.append(curr_varna)
                    word.append(next_varna[0])
                else:
                    word.append(curr_varna[:-1])
                    if next_varna[0] != SWARA[0]:
                        word.append(marker)
                if next_varna[-1] == VISARGA:
                    # NOTE: This was mostly meant to handle version with
                    # `technical=False`
                    LOGGER.warning(
                        f"Next Varna is SWARA + VISARGA: {next_varna}"
                    )
                    word.append(next_varna[-1])
            if next_varna in AYOGAVAAHA_COMMON:
                i += 1
                word.append(curr_varna[:-1] + next_varna)
            if next_varna in vowel_markers:
                i += 1
                word.append(curr_varna[:-1])
                # The artificial marker only denotes the inherent vowel
                if next_varna != ARTIFICIAL_MATRA_A:
                    word.append(next_varna)

        if curr_varna in standalone_markers:
            word.append(curr_varna)
    return "".join(word)
###############################################################################
###############################################################################
# Ucchaarana Sthaana Module
# ------------------------
# Abbreviation -> list of letters/marks in that sthaana class.
# A letter may appear under more than one key; get_ucchaarana_letter()
# reports every key whose list contains it.
STHAANA = {
"S_K": ["अ", "आ"] + KANTHYA + ["ह"] + [VISARGA],
"S_T": ["इ", "ई"] + TALAVYA + ["य", "श"],
"S_M": ["ऋ", "ॠ"] + MURDHANYA + ["र", "ष"],
"S_D": ["ऌ", "ॡ"] + DANTYA + ["ल", "स"],
"S_O": ["उ", "ऊ"] + AUSHTHYA + [UPADHMANIYA],
"S_N": VARGA_PANCHAMA + [ANUSWARA],
"S_KT": ["ए", "ऐ"],
"S_KO": ["ओ", "औ"],
"S_DO": ["व"],
"S_JM": [JIHVAAMULIYA],
}
# Abbreviation -> Sanskrit name, used when `abbrev=False`.
# Must have the same keys as STHAANA.
STHAANA_NAMES = {
"S_K": "कण्ठः",
"S_T": "तालु",
"S_M": "मूर्धा",
"S_D": "दन्ताः",
"S_O": "ओष्ठौ",
"S_N": "नासिका",
"S_KT": "कण्ठतालु",
"S_KO": "कण्ठौष्ठम्",
"S_DO": "दन्तौष्ठम्",
"S_JM": "जिह्वामूलम्",
}
###############################################################################
# Abbreviation -> list of letters/marks in that aabhyantara prayatna class.
AABHYANTARA = {
"A_SP": VARGIYA,
"A_ISP": ANTAHSTHA,
"A_IVVT": USHMA + [JIHVAAMULIYA, UPADHMANIYA],
# All of SWARA except its first entry, which gets its own class below
"A_VVT": SWARA[1:] + [CHANDRABINDU, ANUSWARA, VISARGA],
"A_SVT": SWARA[:1],
}
# Abbreviation -> Sanskrit name, used when `abbrev=False`.
# Must have the same keys as AABHYANTARA.
AABHYANTARA_NAMES = {
"A_SP": "स्पृष्टः",
"A_ISP": "ईषत्स्पृष्टः",
"A_IVVT": "ईषद्विवृतः",
"A_VVT": "विवृतः",
"A_SVT": "संवृतः",
}
###############################################################################
# Abbreviation -> list of letters/marks in that baahya prayatna class.
# The lists are computed at import time; `[0]` takes the first resolution
# returned by resolve_pratyaahaara().
BAAHYA = {
"B_VVR": resolve_pratyaahaara("खर्")[0],
"B_SVR": resolve_pratyaahaara("हश्")[0] + SWARA,
"B_SW": resolve_pratyaahaara("खर्")[0],
"B_ND": resolve_pratyaahaara("हश्")[0] + SWARA,
"B_GH": resolve_pratyaahaara("हश्")[0] + SWARA,
"B_AGH": resolve_pratyaahaara("खर्")[0],
"B_AP": (
VARGA_PRATHAMA
+ VARGA_TRITIYA
+ VARGA_PANCHAMA
+ resolve_pratyaahaara("यण्")[0]
)
+ SWARA
+ [CHANDRABINDU, ANUSWARA],
"B_MP": (VARGA_DWITIYA + VARGA_CHATURTHA + resolve_pratyaahaara("शल्")[0])
+ [VISARGA, JIHVAAMULIYA, UPADHMANIYA],
"B_U": SWARA,
# Entries here are two-character strings: vowel letter + accent mark
"B_ANU": [s + ANUDATTA for s in SWARA],
"B_SWA": [s + SWARITA for s in SWARA],
}
# Abbreviation -> Sanskrit name, used when `abbrev=False`.
# Must have the same keys as BAAHYA.
BAAHYA_NAMES = {
"B_VVR": "विवारः",
"B_SVR": "संवारः",
"B_SW": "श्वासः",
"B_ND": "नादः",
"B_GH": "घोषः",
"B_AGH": "अघोषः",
"B_AP": "अल्पप्राणः",
"B_MP": "महाप्राणः",
"B_U": "उदात्तः",
"B_ANU": "अनुदात्तः",
"B_SWA": "स्वरितः",
}
###############################################################################
# All three dimensions merged into one mapping (keys are distinct thanks to
# the S_/A_/B_ prefixes); used by get_ucchaarana_vector().
UCCHAARANA = dict(**STHAANA, **AABHYANTARA, **BAAHYA)
UCCHAARANA_NAMES = dict(**STHAANA_NAMES, **AABHYANTARA_NAMES, **BAAHYA_NAMES)
###############################################################################
def get_ucchaarana_vector(letter: str, abbrev: bool = False) -> Dict[str, int]:
    """
    Get ucchaarana sthaana and prayatna based vector of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    vector : Dict[str, int]
        Indicator vector with one entry per key of `UCCHAARANA`
        (utpatti sthaana, aabhyantara prayatna and baahya prayatna);
        the value is 1 if the letter belongs to that class, else 0.
    """
    # Classes list bare letters, so drop HALANTA from a consonant like "क्"
    varna = letter.replace(HALANTA, "") if letter.endswith(HALANTA) else letter

    vector = {}
    for key, varna_list in UCCHAARANA.items():
        name = key if abbrev else UCCHAARANA_NAMES[key]
        vector[name] = 1 if varna in varna_list else 0
    return vector
def get_ucchaarana_vectors(
    word: str, abbrev: bool = False
) -> List[Tuple[str, Dict[str, int]]]:
    """
    Get ucchaarana sthaana and prayatna based vector of a word or text

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    vectors : List[Tuple[str, Dict[str, int]]]
        List of (letter, vector)
    """
    letters = []
    for letter in split_varna_word(word, technical=False):
        if any(mark in letter for mark in AYOGAVAAHA_COMMON):
            # Element with an attached mark: treat each character separately
            letters.extend(letter)
        else:
            letters.append(letter)
    return [
        (letter, get_ucchaarana_vector(letter, abbrev)) for letter in letters
    ]
###############################################################################
def get_signature_letter(letter: str, abbrev: bool = False) -> Dict[str, str]:
    """
    Get ucchaarana sthaana and prayatna based signature of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    signature : Dict[str, str]
        utpatti sthaana, aabhyantara prayatna and baahya prayatna of a letter
        under the keys "sthaana", "aabhyantara" and "baahya".
    """
    return {
        "sthaana": get_ucchaarana_letter(letter, dimension=0, abbrev=abbrev),
        "aabhyantara": get_ucchaarana_letter(
            letter, dimension=1, abbrev=abbrev
        ),
        "baahya": get_ucchaarana_letter(letter, dimension=2, abbrev=abbrev),
    }
def get_signature_word(
    word: str, abbrev: bool = False
) -> List[Tuple[str, Dict[str, str]]]:
    """
    Get ucchaarana sthaana and prayatna based signature of a word

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
        Caution: If multiple words are provided,
        the spaces are not included in the output list.
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    List[Tuple[str, Dict[str, str]]]
        List of (letter, signature)
    """
    letters = []
    for letter in split_varna_word(word, technical=False):
        if any(mark in letter for mark in AYOGAVAAHA_COMMON):
            # Element with an attached mark: treat each character separately
            letters.extend(letter)
        else:
            letters.append(letter)
    return [
        (letter, get_signature_letter(letter, abbrev)) for letter in letters
    ]
def get_signature(
    text: str, abbrev: bool = False
) -> List[List[List[Tuple[str, Dict[str, str]]]]]:
    """
    Get ucchaarana signature list of a Sanskrit text

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    List[List[List[Tuple[str, Dict[str, str]]]]]
        List of (letter, signature) for words in a nested list format
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [get_signature_word(word, abbrev) for word in line.split()]
        for line in split_lines(text.strip())
    ]
###############################################################################
def get_ucchaarana_letter(
    letter: str, dimension: int = 0, abbrev: bool = False
) -> str:
    """
    Get ucchaarana sthaana or prayatna of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
    dimension : int
        - 0: sthaana
        - 1: aabhyantara prayatna
        - 2: baahya prayatna

        The default is 0.
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    str
        ucchaarana sthaana or prayatna of a letter.
        Abbreviations are joined with "-". Sanskrit names are joined with
        a space, followed by "च" when there is more than one.
        Empty string if the letter belongs to no class.

    Raises
    ------
    IndexError
        If `dimension` is not a valid index into the three tables.
    """
    # Classes list bare letters, so drop HALANTA from a consonant like "क्"
    varna = letter.replace(HALANTA, "") if letter.endswith(HALANTA) else letter

    classes = (STHAANA, AABHYANTARA, BAAHYA)[dimension]
    names = (STHAANA_NAMES, AABHYANTARA_NAMES, BAAHYA_NAMES)[dimension]

    matched = [key for key, members in classes.items() if varna in members]
    if abbrev:
        return "-".join(matched)

    ucchaarana = [names[key] for key in matched]
    if len(ucchaarana) > 1:
        ucchaarana.append("च")
    return " ".join(ucchaarana)
def get_ucchaarana_word(
    word: str, dimension: int = 0, abbrev: bool = False
) -> List[Tuple[str, str]]:
    """
    Get ucchaarana of a word

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
        **Caution**: If multiple words are provided, the spaces are not
        included in the output list
    dimension : int
        - 0: sthaana
        - 1: aabhyantara prayatna
        - 2: baahya prayatna

        The default is 0.
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    List[Tuple[str, str]]
        List of (letter, ucchaarana)
    """
    letters = []
    for letter in split_varna_word(word, technical=False):
        if any(mark in letter for mark in AYOGAVAAHA_COMMON):
            # Element with an attached mark: treat each character separately
            letters.extend(letter)
        else:
            letters.append(letter)
    return [
        (letter, get_ucchaarana_letter(letter, dimension, abbrev))
        for letter in letters
    ]
def get_ucchaarana(
    text: str, dimension: int = 0, abbrev: bool = False
) -> List[List[List[Tuple[str, str]]]]:
    """
    Get ucchaarana list of a Sanskrit text

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    dimension : int
        - 0: sthaana
        - 1: aabhyantara prayatna
        - 2: baahya prayatna

        The default is 0.
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names
        The default is False.

    Returns
    -------
    List[List[List[Tuple[str, str]]]]
        List of (letter, ucchaarana) for words in a nested list format
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [
            get_ucchaarana_word(word, dimension, abbrev)
            for word in line.split()
        ]
        for line in split_lines(text.strip())
    ]
###############################################################################
def get_sthaana_letter(letter: str, abbrev: bool = False) -> str:
    """Wrapper for get_ucchaarana_letter for sthaana (dimension 0)"""
    return get_ucchaarana_letter(letter, dimension=0, abbrev=abbrev)
def get_sthaana_word(word: str, abbrev: bool = False) -> List[Tuple[str, str]]:
    """Wrapper for get_ucchaarana_word for sthaana (dimension 0)"""
    return get_ucchaarana_word(word, dimension=0, abbrev=abbrev)
def get_sthaana(
    text: str, abbrev: bool = False
) -> List[List[List[Tuple[str, str]]]]:
    """Wrapper for get_ucchaarana for sthaana (dimension 0)"""
    return get_ucchaarana(text, dimension=0, abbrev=abbrev)
# --------------------------------------------------------------------------- #
def get_aabhyantara_letter(letter: str, abbrev: bool = False) -> str:
    """Wrapper for get_ucchaarana_letter for aabhyantara (dimension 1)"""
    return get_ucchaarana_letter(letter, dimension=1, abbrev=abbrev)
def get_aabhyantara_word(
    word: str, abbrev: bool = False
) -> List[Tuple[str, str]]:
    """Wrapper for get_ucchaarana_word for aabhyantara (dimension 1)"""
    return get_ucchaarana_word(word, dimension=1, abbrev=abbrev)
def get_aabhyantara(
    text: str, abbrev: bool = False
) -> List[List[List[Tuple[str, str]]]]:
    """Wrapper for get_ucchaarana for aabhyantara (dimension 1)"""
    return get_ucchaarana(text, dimension=1, abbrev=abbrev)
# --------------------------------------------------------------------------- #
def get_baahya_letter(letter: str, abbrev: bool = False) -> str:
    """Wrapper for get_ucchaarana_letter for baahya (dimension 2)"""
    return get_ucchaarana_letter(letter, dimension=2, abbrev=abbrev)
def get_baahya_word(word: str, abbrev: bool = False) -> List[Tuple[str, str]]:
    """Wrapper for get_ucchaarana_word for baahya (dimension 2)"""
    return get_ucchaarana_word(word, dimension=2, abbrev=abbrev)
def get_baahya(
    text: str, abbrev: bool = False
) -> List[List[List[Tuple[str, str]]]]:
    """Wrapper for get_ucchaarana for baahya (dimension 2)"""
    return get_ucchaarana(text, dimension=2, abbrev=abbrev)
###############################################################################