# Source code for sanskrit_text

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Sanskrit Text Utility"""

###############################################################################

__author__ = """Hrishikesh Terdalkar"""
__email__ = "hrishikeshrt@linuxmail.org"
__version__ = "1.0.0"
__created_at__ = "Tue Apr 17 22:20:39 2018"

###############################################################################

import logging
import re

from collections import defaultdict
from itertools import product
from typing import Dict, Iterable, List, Optional, Tuple, Union

###############################################################################

LOGGER = logging.getLogger(__name__)

###############################################################################


def ord_unicode(ch: str) -> str:
    """Return the hexadecimal Unicode identifier of a character

    Parameters
    ----------
    ch : str
        Single character

    Returns
    -------
    str
        Lowercase hexadecimal code point, left-padded with zeros to at
        least 4 characters (longer for code points above U+FFFF)
    """
    code_point = ord(ch)
    return f"{code_point:04x}"


def chr_unicode(u: str) -> str:
    """Return the character denoted by a hexadecimal Unicode identifier

    Parameters
    ----------
    u : str
        Hexadecimal unicode identifier (typically 4 characters)

    Returns
    -------
    str
        Single character
    """
    code_point = int(u, base=16)
    return chr(code_point)


###############################################################################

DESIGN = """
* len(SWARA) == len(MATRA) + 1  # 'अ' is extra at the beginning
* len(EXTENDED_SWARA) == len(EXTENDED_MATRA) + 1 # 'ऍ' is extra at the end
* It is unclear which of 'ॲ' or 'ऍ' should correspond to 'ॅ', current choice is
  the former. If that changes, the order in EXTENDED_MATRA would need to change
* ARTIFICIAL_MATRA contains absent vowel signs. Currently this is just for 'अ'.
  Any new sign should follow the pattern as hyphen ('-') followed by the vowel
  letter (e.g. '-अ').
* DIGITS, COMBINING_DIGIT_MARKS, PUNCTUATION and GENERAL_PUNCTUATION aren't
  part of the ALPHABET. Their inclusion needs more deliberation.
* VEDIC_MARKS are not used in syllabification functions currently.
"""

###############################################################################
# Alphabet of Sanskrit

SWARA = ["अ", "आ", "इ", "ई", "उ", "ऊ", "ऋ", "ॠ", "ऌ", "ॡ", "ए", "ऐ", "ओ", "औ"]
EXTENDED_SWARA = ["ऎ", "ऒ", "ॲ", "ऑ", "ऍ"]

MATRA = ["ा", "ि", "ी", "ु", "ू", "ृ", "ॄ", "ॢ", "ॣ", "े", "ै", "ो", "ौ"]
EXTENDED_MATRA = ["ॆ", "ॊ", "ॅ", "ॉ"]

KANTHYA = ["क", "ख", "ग", "घ", "ङ"]
TALAVYA = ["च", "छ", "ज", "झ", "ञ"]
MURDHANYA = ["ट", "ठ", "ड", "ढ", "ण"]
DANTYA = ["त", "थ", "द", "ध", "न"]
AUSHTHYA = ["प", "फ", "ब", "भ", "म"]
ANTAHSTHA = ["य", "र", "ल", "व"]
USHMA = ["श", "ष", "स", "ह"]
VISHISHTA = ["ळ"]
EXTENDED_VYANJANA = ["ऩ", "ऱ", "ऴ", "क़", "ख़", "ग़", "ज़", "ड़", "ढ़", "फ़", "य़"]

# --------------------------------------------------------------------------- #

ARTIFICIAL_MATRA_A = f"-{SWARA[0]}"

OM = "ॐ"
AVAGRAHA = "ऽ"

SWARITA = "॑"
DOUBLE_SWARITA = "᳚"
TRIPLE_SWARITA = "᳛"
ANUDATTA = "॒"
CHANDRABINDU = "ँ"
CHANDRABINDU_VIRAMA = "ꣳ"
CHANDRABINDU_SPACING = "ꣲ"
CHANDABINDU_TWO = "ꣵ"
CHANDRABINDU_THREE = "ꣶ"

ANUSWARA = "ं"
VISARGA = "ः"
ARDHAVISARGA = "ᳲ"
JIHVAAMULIYA = "ᳵ"
UPADHMANIYA = "ᳶ"

HALANTA = "्"
NUKTA = "़"  # unused
ABBREV = "॰"
DANDA = "।"
DOUBLE_DANDA = "॥"

# --------------------------------------------------------------------------- #
# Groups

# The 25 "varga" consonants: five places of articulation, five letters each.
# The order matters: letter `i` belongs to group `i // 5`, position `i % 5`.
VARGIYA = KANTHYA + TALAVYA + MURDHANYA + DANTYA + AUSHTHYA
# All basic consonants; the first 25 entries are exactly VARGIYA.
VYANJANA = VARGIYA + ANTAHSTHA + USHMA + VISHISHTA

# Column-wise slices of VARGIYA: the n-th letter of each of the five groups.
VARGA_PRATHAMA = [VARGIYA[i * 5] for i in range(5)]
VARGA_DWITIYA = [VARGIYA[i * 5 + 1] for i in range(5)]
VARGA_TRITIYA = [VARGIYA[i * 5 + 2] for i in range(5)]
VARGA_CHATURTHA = [VARGIYA[i * 5 + 3] for i in range(5)]
VARGA_PANCHAMA = [VARGIYA[i * 5 + 4] for i in range(5)]

# Short vowels and their vowel signs, picked by position from SWARA / MATRA.
# MATRA has no sign for the first vowel, hence the indices are shifted by one.
LAGHU_SWARA = [SWARA[i] for i in [0, 2, 4, 6, 8]] + EXTENDED_SWARA[:2]
LAGHU_MATRA = [MATRA[i] for i in [1, 3, 5, 7]] + EXTENDED_MATRA[:2]

# --------------------------------------------------------------------------- #

# Marks that attach to a preceding vowel/consonant (common and full set).
AYOGAVAAHA_COMMON = [CHANDRABINDU, ANUSWARA, VISARGA]
AYOGAVAAHA = AYOGAVAAHA_COMMON + [JIHVAAMULIYA, UPADHMANIYA]

# Accent marks.
VEDIC_MARKS = [SWARITA, ANUDATTA, DOUBLE_SWARITA, TRIPLE_SWARITA]
# Stand-alone symbols; get_syllables_word() treats these as syllable starts.
SPECIAL = [
    AVAGRAHA,
    OM,
    CHANDRABINDU_VIRAMA,
    CHANDRABINDU_SPACING,
    CHANDABINDU_TWO,
    CHANDRABINDU_THREE,
]
OTHER = [HALANTA]

# --------------------------------------------------------------------------- #

# Placeholder vowel signs for vowels that have no written sign.
ARTIFICIAL_MATRA = [ARTIFICIAL_MATRA_A]

ALL_SWARA = SWARA + EXTENDED_SWARA
ALL_VYANJANA = VYANJANA + EXTENDED_VYANJANA
ALL_MATRA = MATRA + EXTENDED_MATRA  # does NOT contain ARTIFICIAL_MATRA

# Independent letters (vowels + consonants).
VARNA = ALL_SWARA + ALL_VYANJANA
# Every character that clean() keeps by default (before optional additions).
ALPHABET = VARNA + ALL_MATRA + AYOGAVAAHA + SPECIAL + OTHER + VEDIC_MARKS

# --------------------------------------------------------------------------- #

SPACES = [" ", "\t", "\n", "\r"]
PUNCTUATION = [DANDA, DOUBLE_DANDA, ABBREV]
GENERAL_PUNCTUATION = [".", ",", ";", "", '"', "'", "`"]

DIGITS = ["०", "१", "२", "३", "४", "५", "६", "७", "८", "९"]
COMBINING_DIGIT_MARKS = ["꣠", "꣡", "꣢", "꣣", "꣤", "꣥", "꣦", "꣧", "꣨", "꣩"]

# --------------------------------------------------------------------------- #
# Special Sequences

KSHA = "क्ष"
JNA = "ज्ञ"

###############################################################################


HOW_TO_WRITE = """
Unicode characters chan be typed directly from the keyboard as follows,
[Ctrl+Shift+u] [4-digit-unicode-identifier] [space]

Some of the characters can also be typed using m17n-sanskrit-itrans keyboard
(Package: https://launchpad.net/ubuntu/+source/ibus-m17n)
(File: /usr/share/m17n/sa-itrans.mim)


Notable Unicodes and Shortcuts
---
1cf2 for Ardhavisarga
1cf5 for Jihvamuliya -- kH
1cf6 for Upadhmaniya -- pH
0951 for Swarita -- ''
0952 for Anudatta -- _
0901 for Chandrabindu -- .N
a8f2 for (stand-alone) Chandrabindu Spacing
093d for Avagraha -- .a
094d for Halanta -- .h

0950 for Om -- OM
a8e0 to a8e9 for Combining Devanagari Digits 0-9 (Swara Marks for Samaveda)
"""

###############################################################################

MAAHESHWARA_SUTRA = [
    ["अ", "इ", "उ", "ण्"],
    ["ऋ", "ऌ", "क्"],
    ["ए", "ओ", "ङ्"],
    ["ऐ", "औ", "च्"],
    ["ह", "य", "व", "र", "ट्"],
    ["ल", "ण्"],
    ["ञ", "म", "ङ", "ण", "न", "म्"],
    ["झ", "भ", "ञ्"],
    ["घ", "ढ", "ध", "ष्"],
    ["ज", "ब", "ग", "ड", "द", "श्"],
    ["ख", "फ", "छ", "ठ", "थ", "च", "ट", "त", "व्"],
    ["क", "प", "य्"],
    ["श", "ष", "स", "र्"],
    ["ह", "ल्"],
]

# --------------------------------------------------------------------------- #

# Flattened sequence of all sutra entries, including the halanta-terminated
# marker at the end of each sutra.
MAAHESHWARA_KRAMA = [varna for sutra in MAAHESHWARA_SUTRA for varna in sutra]

# --------------------------------------------------------------------------- #

# Maps every sutra entry to a list of (sutra index, position within sutra,
# running letter index) triples. An entry occurring in more than one place
# gets one triple per occurrence.
MAAHESHWARA_IDX = defaultdict(list)

# Running index counting only entries without a halanta, i.e. actual letters.
# NOTE: `idx`, `sutra`, `varna` and `_idx` stay defined as module globals
# after this loop finishes.
idx = 0
for _sutra_idx, sutra in enumerate(MAAHESHWARA_SUTRA):
    for _internal_idx, varna in enumerate(sutra):
        if HALANTA in varna:
            # Marker entries do not take part in the running letter order
            _idx = -1
        else:
            _idx = idx
            idx += 1
        MAAHESHWARA_IDX[varna].append((_sutra_idx, _internal_idx, _idx))

###############################################################################


def form_pratyaahaara(letters: List[str]) -> Optional[str]:
    """Form a pratyaahaara from a list of letters

    Parameters
    ----------
    letters : List[str]
        Letters to be covered by the pratyaahaara.
        Letters that are not present in `MAAHESHWARA_IDX`, or that contain
        a halanta, are ignored (and reported at INFO level).
        A letter that occurs twice in the sutras may appear twice in the
        list, one occurrence per position.

    Returns
    -------
    Optional[str]
        The pratyaahaara (first letter followed by the end marker of the
        sutra containing the last letter), or None if
        - no valid letter is left after ignoring unknown ones,
        - the letters do not form a continuous run in the sutra order, or
        - the last letter is not immediately followed by its sutra's
          end marker.
    """
    varna_idx = []
    ignored = []

    for varna in letters:
        if varna in MAAHESHWARA_IDX and HALANTA not in varna:
            varna_idx.append(MAAHESHWARA_IDX[varna])
        else:
            ignored.append(varna)

    if ignored:
        LOGGER.info(f"Ignored letters: {ignored}")

    if not varna_idx:
        # product() of nothing yields a single empty combination, which
        # would otherwise fail on the first/last element lookups below.
        LOGGER.warning("Cannot form a pratyaahara without any valid letter.")
        return None

    # A letter may occur at several positions; try every combination of
    # positions until one forms an unbroken run of running letter indices.
    for candidate in product(*varna_idx):
        v_idx = sorted(candidate, key=lambda x: x[2])
        _v_idx = [w[2] for w in v_idx]
        if _v_idx == list(range(_v_idx[0], _v_idx[-1] + 1)):
            break
    else:
        LOGGER.warning("Cannot form a pratyaahara due to discontinuity.")
        return None

    _aadi_idx = v_idx[0]
    _pre_antya_idx = v_idx[-1]

    # The last letter must sit right before the end marker of its sutra.
    if _pre_antya_idx[1] != len(MAAHESHWARA_SUTRA[_pre_antya_idx[0]]) - 2:
        LOGGER.warning("Cannot form a pratyaahara due to end position.")
        return None

    aadi = MAAHESHWARA_SUTRA[_aadi_idx[0]][_aadi_idx[1]]
    antya = MAAHESHWARA_SUTRA[_pre_antya_idx[0]][-1]
    return f"{aadi}{antya}"


def resolve_pratyaahaara(pratyaahaara: str) -> List[List[str]]:
    """Expand a pratyaahaara into every list of letters it may denote

    Parameters
    ----------
    pratyaahaara : str
        First character is the starting letter, the remainder is the
        end marker (a letter followed by halanta).

    Returns
    -------
    List[List[str]]
        One list of letters per (start, end) pair of positions in
        `MAAHESHWARA_KRAMA` where the start precedes the end.
        Marker entries (those containing a halanta) are left out.
    """
    first_letter, end_marker = pratyaahaara[0], pratyaahaara[1:]

    start_positions = [
        position
        for position, entry in enumerate(MAAHESHWARA_KRAMA)
        if entry == first_letter
    ]
    end_positions = [
        position
        for position, entry in enumerate(MAAHESHWARA_KRAMA)
        if entry == end_marker
    ]

    expansions = []
    for begin in start_positions:
        for finish in end_positions:
            if begin >= finish:
                continue
            span = MAAHESHWARA_KRAMA[begin:finish]
            expansions.append(
                [entry for entry in span if HALANTA not in entry]
            )
    return expansions

###############################################################################


def clean(
    text: str,
    punct: bool = False,
    digits: bool = False,
    spaces: bool = True,
    allow: Optional[Iterable[str]] = None,
) -> str:
    """Clean a line of Sanskrit (Devanagari) text

    Characters outside the permitted set are dropped, empty lines are
    removed and runs of whitespace within a line collapse to one space.

    Parameters
    ----------
    text : str
        Input string
    punct : bool, optional
        If True, the punctuations are kept.
        The default is False.
    digits : bool, optional
        If True, digits are kept.
        The default is False.
    spaces : bool, optional
        If False, spaces are removed.
        It is recommended to not change the default value
        unless it is specifically relevant to a use-case.
        The default is True.
    allow : iterable of str, optional
        Additional single characters to keep (list, tuple, set or string).
        The default is None.

    Returns
    -------
    str
        Clean version of the string
    """
    # A set gives constant-time membership tests; the character filter
    # below runs once per input character.
    permitted = set(ALPHABET)
    if allow:
        permitted.update(allow)
    if spaces:
        permitted.update(SPACES)
    if punct:
        permitted.update(PUNCTUATION)
        permitted.update(GENERAL_PUNCTUATION)
    if digits:
        permitted.update(DIGITS)

    kept = "".join(c for c in text if c in permitted)
    return "\n".join(
        " ".join(line.split()) for line in kept.split("\n") if line.strip()
    )


def split_lines(text: str, pattern=r"[।॥\r\n]+") -> List[str]:
    """Break a string into non-empty pieces at regular expression matches

    Parameters
    ----------
    text : str
        Input string
    pattern : regexp, optional
        Regular expression corresponding to the split points.
        The default is r'[।॥\\r\\n]+'.

    Returns
    -------
    List[str]
        Pieces of the string with empty pieces discarded
    """
    pieces = re.split(pattern, text)
    return [piece for piece in pieces if piece]


###############################################################################


def trim_matra(line: str) -> str:
    """
    Strip a trailing marker and a trailing mātrā from the end of a string.

    This is a rough orthographic normalisation (useful, for example, when
    grouping words by their final consonant base). It is **not** stemming
    or lemmatisation in any linguistic sense.

    Two passes are made over the end of the string, in this order:

    1. One final anusvāra, halanta or visarga is dropped, if present.
    2. One final mātrā is dropped, if present after pass (1).

    An empty input is returned unchanged.
    """
    trimmed = line
    for removable in ([ANUSWARA, HALANTA, VISARGA], ALL_MATRA):
        if trimmed and trimmed[-1] in removable:
            trimmed = trimmed[:-1]
    return trimmed


###############################################################################


def is_laghu(syllable: str) -> bool:
    """Tell whether a syllable is Laghu

    A syllable counts as Laghu when every character in it is a consonant,
    a halanta, a short vowel or a short vowel sign.
    An empty string therefore counts as Laghu.
    """
    for ch in syllable:
        if ch == HALANTA:
            continue
        if ch in ALL_VYANJANA or ch in LAGHU_SWARA or ch in LAGHU_MATRA:
            continue
        return False
    return True


def toggle_matra(syllable: str) -> Optional[str]:
    """Change the Laghu syllable to Guru and Guru to Laghu (if possible)

    Parameters
    ----------
    syllable : str
        Either a syllable ending in a vowel sign, or a single vowel letter.

    Returns
    -------
    Optional[str]
        The syllable with its short/long counterpart substituted, or
        None if no counterpart is defined here (empty input, syllable
        without a toggleable vowel sign, or a vowel without a pair).
    """
    if not syllable:
        return None

    last = syllable[-1]
    if last in MATRA:
        index = MATRA.index(last)
        # MATRA has short/long signs paired at positions (1,2) ... (7,8)
        if index in (2, 4, 6, 8):
            return syllable[:-1] + MATRA[index - 1]
        if index in (1, 3, 5, 7):
            return syllable[:-1] + MATRA[index + 1]

    if syllable in SWARA:
        index = SWARA.index(syllable)
        # SWARA has short/long vowels paired at positions (0,1) ... (8,9)
        if index in (0, 2, 4, 6, 8):
            return SWARA[index + 1]
        if index in (1, 3, 5, 7, 9):
            return SWARA[index - 1]

    return None


###############################################################################


def marker_to_swara(m: str) -> str:
    """Map a Matra (vowel sign) to the Swara (vowel letter) it stands for

    Artificial signs ('-' followed by the vowel) yield the vowel itself.
    Returns None when `m` is not a known vowel sign.
    """
    if m in ARTIFICIAL_MATRA:
        return m[1:]

    # SWARA carries one extra vowel at the front, hence the shift of 1;
    # the extended lists line up position by position.
    lookups = (
        (MATRA, SWARA, 1),
        (EXTENDED_MATRA, EXTENDED_SWARA, 0),
    )
    for signs, vowels, shift in lookups:
        if m in signs:
            return vowels[signs.index(m) + shift]
    return None


def swara_to_marker(s: str) -> str:
    """Map a Swara (vowel letter) to its corresponding Matra (vowel sign)

    The first vowel has no written sign, so an artificial sign
    ('-' followed by the vowel) is returned for it.
    Returns None when `s` has no corresponding sign.
    """
    if s == SWARA[0]:
        return f"-{s}"

    if s in SWARA:
        # MATRA starts at the second vowel, hence the shift of one
        return MATRA[SWARA.index(s) - 1]

    # The last extended vowel has no sign of its own
    paired_vowels = EXTENDED_SWARA[:-1]
    if s in paired_vowels:
        return EXTENDED_MATRA[paired_vowels.index(s)]
    return None


###############################################################################


def get_anunaasika(ch: str) -> str:
    """Pick the nasal that matches the group of the given character

    - empty string: the fifth letter of the AUSHTHYA group
    - one of the first 25 consonants: the fifth letter of its own group
    - anything else: anuswara
    """
    if ch == "":
        return AUSHTHYA[4]
    if ch not in VYANJANA:
        return ANUSWARA

    position = VYANJANA.index(ch)
    if position >= 25:
        return ANUSWARA

    # Groups are laid out in consecutive runs of five letters
    group_start = position - position % 5
    return VYANJANA[group_start + 4]


def fix_anuswara(text: str) -> str:
    """
    Replace each anuswara that precedes a varga consonant by the nasal
    of that consonant's group followed by halanta

    All other characters, including the final one, are copied unchanged.
    """
    if not text:
        return ""

    pieces = []
    for current, following in zip(text, text[1:]):
        if current == ANUSWARA and following in VARGIYA:
            pieces.append(get_anunaasika(following) + HALANTA)
        else:
            pieces.append(current)
    # The last character has no successor and is never replaced
    pieces.append(text[-1])
    return "".join(pieces)


###############################################################################


def get_syllables_word(word: str, technical: bool = False) -> List[str]:
    """Split a Sanskrit (Devanagari) word into syllables

    Parameters
    ----------
    word : str
        Sanskrit (Devanagari) word to get syllables from.
        The word is passed through `clean(word, spaces=False)` first,
        so spaces and characters outside the alphabet are dropped.
    technical : bool, optional
        If True, ensures that each element contains at most
        one Swara or Vyanjana.
        The default is False.

    Returns
    -------
    List[str]
        List of syllables
    """
    cleaned = clean(word, spaces=False)
    total = len(cleaned)

    # Characters that begin a new unit
    boundary_chars = VARNA + SPECIAL
    if technical:
        boundary_chars = boundary_chars + AYOGAVAAHA_COMMON

    syllables = []
    pending = ""
    position = 0
    while position < total:
        pending += cleaned[position]
        position += 1
        # Attach everything up to the next boundary character
        while position < total and cleaned[position] not in boundary_chars:
            pending += cleaned[position]
            position += 1
        # A unit ending in halanta is carried over into the next one,
        # unless in technical mode or at the end of the word
        if technical or position == total or pending[-1] != HALANTA:
            syllables.append(pending)
            pending = ""
    return syllables


def get_syllables(text: str, technical: bool = False) -> List[List[List[str]]]:
    """Split a Sanskrit (Devanagari) text into syllables

    Parameters
    ----------
    text : str
        Sanskrit (Devanagari) text to get syllables from
    technical : bool, optional
        If True, ensures that each element contains at most
        one Swara or Vyanjana.
        The default is False.

    Returns
    -------
    List[List[List[str]]]
        List of syllables in a nested list format
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [get_syllables_word(word, technical) for word in line.split()]
        for line in split_lines(text.strip())
    ]


###############################################################################


def split_varna_word(word: str, technical: bool = True) -> List[str]:
    """Obtain the Varna decomposition of a Sanskrit (Devanagari) word

    Parameters
    ----------
    word : str
        Sanskrit (Devanagari) word to be split.
    technical : bool, optional
        If True, consonants (with halanta), vowel signs and ayogavaaha
        marks are kept as separate elements, which is more useful for
        analysis and is the form expected by `join_varna`.
        If False, vowel signs are replaced by the corresponding vowel
        letters and ayogavaaha marks are attached to the preceding element.
        The default is True.

    Returns
    -------
    List[str]
        List of Varna
    """
    word_syllables = get_syllables_word(word, True)
    word_viccheda = []
    for syllable in word_syllables:
        if syllable[0] in ALL_SWARA:
            word_viccheda.append(syllable[0])
            if len(syllable) > 1:
                word_viccheda.append(syllable[1])
            # TODO: Will this ever be the case?
            if len(syllable) > 2:
                LOGGER.warning(f"Long SWARA: {syllable}")
                word_viccheda.append(syllable[2:])
        elif syllable[0] in ALL_VYANJANA:
            word_viccheda.append(syllable[0] + HALANTA)
            if len(syllable) == 1:
                # A bare consonant carries the inherent first vowel
                word_viccheda.append(ARTIFICIAL_MATRA_A)
            if len(syllable) > 1:
                if syllable[1] in AYOGAVAAHA_COMMON:
                    word_viccheda.append(ARTIFICIAL_MATRA_A)
                if syllable[1] != HALANTA:
                    word_viccheda.append(syllable[1])
            # TODO: Will this ever be the case?
            if len(syllable) > 2:
                LOGGER.warning(f"Long VYANJANA: {syllable}")
                word_viccheda.append(syllable[2:])
        else:
            word_viccheda.append(syllable)

    if not technical:
        real_word_viccheda = []
        for varna in word_viccheda:
            if varna in ARTIFICIAL_MATRA + ALL_MATRA:
                real_word_viccheda.append(marker_to_swara(varna))
            elif varna in AYOGAVAAHA_COMMON:
                if real_word_viccheda:
                    real_word_viccheda[-1] += varna
                else:
                    # A mark at the very start of the word has nothing
                    # to attach to; keep it as its own element.
                    real_word_viccheda.append(varna)
            else:
                real_word_viccheda.append(varna)
        word_viccheda = real_word_viccheda
    return word_viccheda


def split_varna(
    text: str, technical: bool = True, flat: bool = False
) -> Union[List[List[List[str]]], List[str]]:
    """Obtain the Varna decomposition of a Sanskrit (Devanagari) text

    Parameters
    ----------
    text : str
        Sanskrit (Devanagari) text to be split.
    technical : bool, optional
        Passed on to `split_varna_word` for every word.
        The default is True.
    flat : bool, optional
        If True, a single list is returned instead of nested lists.
        The default is False.

    Returns
    -------
    List[List[List[str]]] or List[str]

        Varna decomposition of the text in a nested list format.
        Nesting Levels: Text -> Lines -> Words

        - Varna decomposition of each word is a List[str].
        - List of Varna decomposition of each word from a line.
        - List of Varna decomposition of each line from the text.

        If `flat=True`, Varna decomposition of the entire text is presented
        as a single list, also containing whitespace markers.
        Lines are separated by a newline character '\\n' and words are
        separated by a space character ' '.
    """
    lines = split_lines(text.strip())
    last_line_idx = len(lines) - 1

    viccheda = []
    for line_idx, line in enumerate(lines):
        word_vicchedas = [
            split_varna_word(word, technical) for word in line.split()
        ]
        if not flat:
            viccheda.append(word_vicchedas)
            continue

        for word_idx, word_viccheda in enumerate(word_vicchedas):
            if word_idx:
                # separator between consecutive words of the same line
                viccheda.append(" ")
            viccheda.extend(word_viccheda)
        if line_idx != last_line_idx:
            viccheda.append("\n")
    return viccheda


def join_varna(viccheda: List[str], technical: bool = True) -> str:
    """
    Join Varna decomposition to form a Sanskrit (Devanagari) word

    Parameters
    ----------
    viccheda : list
        Viccheda output obtained by `split_varna_word` with `technical=True`
        (or output of `split_varna` with `technical=True` and `flat=True`)
        IMPORTANT: `technical=True` is necessary.
    technical : bool
        WARNING: Currently unused.
        Value of the same parameter passed to `split_varna_word`


    NOTE
    ----
        Currently only works for the viccheda generated with `technical=True`

    Returns
    -------
    s : str
        Sanskrit word
    """
    word = []
    i = 0
    while i < len(viccheda):
        curr_varna = viccheda[i]
        next_varna = ""
        if i < len(viccheda) - 1:
            next_varna = viccheda[i + 1]

        i += 1

        # Whitespace markers (from `flat=True` output) pass through as-is
        if curr_varna in [" ", "\n"]:
            word.append(curr_varna)
            continue

        if curr_varna[0] in ALL_SWARA + SPECIAL:
            word.append(curr_varna[0])
            if curr_varna[-1] in AYOGAVAAHA_COMMON:
                word.append(curr_varna[-1])
        if curr_varna[-1] == HALANTA:
            # Consonant: what follows decides how it is written
            if next_varna in [" ", "\n"]:
                word.append(curr_varna)
                continue
            if next_varna == "":
                word.append(curr_varna)
                break
            if next_varna[-1] == HALANTA:
                # Followed by another consonant: keep the halanta (conjunct)
                word.append(curr_varna)
            if next_varna[0] in ALL_SWARA:
                # Followed by a vowel letter: write the vowel as a sign
                i += 1
                marker = swara_to_marker(next_varna[0])
                if marker is None:
                    # No vowel sign exists for this vowel; keep the
                    # consonant with halanta followed by the vowel letter
                    word.append(curr_varna)
                    word.append(next_varna[0])
                else:
                    word.append(curr_varna[:-1])
                    # The first vowel is inherent and has no written sign
                    if marker != ARTIFICIAL_MATRA_A:
                        word.append(marker)
                if next_varna[-1] == VISARGA:
                    # NOTE: This was mostly meant to handle version with
                    # `technical=False`
                    LOGGER.warning(
                        f"Next Varna is SWARA + VISARGA: {next_varna}"
                    )
                    word.append(next_varna[-1])
            if next_varna in AYOGAVAAHA_COMMON:
                i += 1
                word.append(curr_varna[:-1] + next_varna)
            if next_varna in ARTIFICIAL_MATRA + ALL_MATRA:
                i += 1
                word.append(curr_varna[:-1])
                if next_varna != ARTIFICIAL_MATRA_A:
                    word.append(next_varna)
        if curr_varna in ARTIFICIAL_MATRA + ALL_MATRA + AYOGAVAAHA_COMMON:
            word.append(curr_varna)

    return "".join(word)


###############################################################################

###############################################################################
# Ucchaarana Sthaana Module
# ------------------------

STHAANA = {
    "S_K": ["अ", "आ"] + KANTHYA + ["ह"] + [VISARGA],
    "S_T": ["इ", "ई"] + TALAVYA + ["य", "श"],
    "S_M": ["ऋ", "ॠ"] + MURDHANYA + ["र", "ष"],
    "S_D": ["ऌ", "ॡ"] + DANTYA + ["ल", "स"],
    "S_O": ["उ", "ऊ"] + AUSHTHYA + [UPADHMANIYA],
    "S_N": VARGA_PANCHAMA + [ANUSWARA],
    "S_KT": ["ए", "ऐ"],
    "S_KO": ["ओ", "औ"],
    "S_DO": ["व"],
    "S_JM": [JIHVAAMULIYA],
}

STHAANA_NAMES = {
    "S_K": "कण्ठः",
    "S_T": "तालु",
    "S_M": "मूर्धा",
    "S_D": "दन्ताः",
    "S_O": "ओष्ठौ",
    "S_N": "नासिका",
    "S_KT": "कण्ठतालु",
    "S_KO": "कण्ठौष्ठम्",
    "S_DO": "दन्तौष्ठम्",
    "S_JM": "जिह्वामूलम्",
}

###############################################################################

# Letters grouped by aabhyantara prayatna; keys are the abbreviations whose
# Sanskrit names are given in AABHYANTARA_NAMES.
AABHYANTARA = {
    "A_SP": VARGIYA,
    "A_ISP": ANTAHSTHA,
    "A_IVVT": USHMA + [JIHVAAMULIYA, UPADHMANIYA],
    # every vowel except the first one, plus the common ayogavaaha marks
    "A_VVT": SWARA[1:] + [CHANDRABINDU, ANUSWARA, VISARGA],
    # only the first vowel
    "A_SVT": SWARA[:1],
}

AABHYANTARA_NAMES = {
    "A_SP": "स्पृष्टः",
    "A_ISP": "ईषत्स्पृष्टः",
    "A_IVVT": "ईषद्विवृतः",
    "A_VVT": "विवृतः",
    "A_SVT": "संवृतः",
}

###############################################################################

BAAHYA = {
    "B_VVR": resolve_pratyaahaara("खर्")[0],
    "B_SVR": resolve_pratyaahaara("हश्")[0] + SWARA,
    "B_SW": resolve_pratyaahaara("खर्")[0],
    "B_ND": resolve_pratyaahaara("हश्")[0] + SWARA,
    "B_GH": resolve_pratyaahaara("हश्")[0] + SWARA,
    "B_AGH": resolve_pratyaahaara("खर्")[0],
    "B_AP": (
        VARGA_PRATHAMA
        + VARGA_TRITIYA
        + VARGA_PANCHAMA
        + resolve_pratyaahaara("यण्")[0]
    )
    + SWARA
    + [CHANDRABINDU, ANUSWARA],
    "B_MP": (VARGA_DWITIYA + VARGA_CHATURTHA + resolve_pratyaahaara("शल्")[0])
    + [VISARGA, JIHVAAMULIYA, UPADHMANIYA],
    "B_U": SWARA,
    "B_ANU": [s + ANUDATTA for s in SWARA],
    "B_SWA": [s + SWARITA for s in SWARA],
}

BAAHYA_NAMES = {
    "B_VVR": "विवारः",
    "B_SVR": "संवारः",
    "B_SW": "श्वासः",
    "B_ND": "नादः",
    "B_GH": "घोषः",
    "B_AGH": "अघोषः",
    "B_AP": "अल्पप्राणः",
    "B_MP": "महाप्राणः",
    "B_U": "उदात्तः",
    "B_ANU": "अनुदात्तः",
    "B_SWA": "स्वरितः",
}

###############################################################################

# Merged lookup tables over all three dimensions (sthaana, aabhyantara,
# baahya). dict(**a, **b) raises TypeError on a repeated key, so the key
# prefixes of the three source tables must stay distinct.
UCCHAARANA = dict(**STHAANA, **AABHYANTARA, **BAAHYA)
UCCHAARANA_NAMES = dict(**STHAANA_NAMES, **AABHYANTARA_NAMES, **BAAHYA_NAMES)

###############################################################################


def get_ucchaarana_vector(letter: str, abbrev=False) -> Dict[str, int]:
    """
    Build the sthaana and prayatna membership vector of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    vector : Dict[str, int]
        One entry per category of `UCCHAARANA` (utpatti sthaana,
        aabhyantara prayatna and baahya prayatna); the value is 1 when
        the letter belongs to the category and 0 otherwise
    """
    if letter.endswith(HALANTA):
        varna = letter.replace(HALANTA, "")
    else:
        varna = letter

    # Output label for every category key
    labels = {
        key: (key if abbrev else UCCHAARANA_NAMES[key]) for key in UCCHAARANA
    }

    vector = dict.fromkeys(labels.values(), 0)
    for key, members in UCCHAARANA.items():
        if varna in members:
            vector[labels[key]] = 1
    return vector


def get_ucchaarana_vectors(
    word: str, abbrev: bool = False
) -> List[Tuple[str, Dict[str, int]]]:
    """
    Build the sthaana and prayatna membership vectors of a word or text

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    vectors : List[Tuple[str, Dict[str, int]]]
        List of (letter, vector)
    """
    units = []
    for element in split_varna_word(word, technical=False):
        carries_mark = any(mark in element for mark in AYOGAVAAHA_COMMON)
        # An element carrying an ayogavaaha mark is broken into its
        # individual characters so that the mark gets its own vector
        units += list(element) if carries_mark else [element]

    return [(unit, get_ucchaarana_vector(unit, abbrev)) for unit in units]


###############################################################################


def get_signature_letter(letter: str, abbrev: bool = False) -> Dict[str, str]:
    """
    Describe a letter by its ucchaarana sthaana and prayatna

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    signature : Dict[str, str]
        utpatti sthaana, aabhyantara prayatna and baahya prayatna of a
        letter, under the keys "sthaana", "aabhyantara" and "baahya"
    """
    # Position in this tuple is the `dimension` of get_ucchaarana_letter
    dimension_names = ("sthaana", "aabhyantara", "baahya")
    return {
        name: get_ucchaarana_letter(letter, dimension=position, abbrev=abbrev)
        for position, name in enumerate(dimension_names)
    }


def get_signature_word(
    word: str, abbrev: bool = False
) -> List[Tuple[str, Dict[str, str]]]:
    """
    Describe each letter of a word by its ucchaarana sthaana and prayatna

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
        Caution: If multiple words are provided,
        the spaces are not included in the output list.
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    List[Tuple[str, Dict[str, str]]]
        List of (letter, signature)

    """
    units = []
    for element in split_varna_word(word, technical=False):
        carries_mark = any(mark in element for mark in AYOGAVAAHA_COMMON)
        # An element carrying an ayogavaaha mark is broken into its
        # individual characters so that the mark gets its own signature
        units += list(element) if carries_mark else [element]

    return [(unit, get_signature_letter(unit, abbrev)) for unit in units]


def get_signature(
    text: str, abbrev: bool = False
) -> List[List[List[Tuple[str, Dict[str, str]]]]]:
    """
    Get the letter signatures of a Sanskrit text

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    abbrev : bool
        If True, the output will contain English abbreviations
        otherwise, the output will contain Sanskrit names.
        The default is False.

    Returns
    -------
    List[List[List[Tuple[str, Dict[str, str]]]]]
        List of (letter, signature) for words in a nested list format
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [get_signature_word(word, abbrev) for word in line.split()]
        for line in split_lines(text.strip())
    ]


###############################################################################


def get_ucchaarana_letter(
    letter: str, dimension: int = 0, abbrev: bool = False
) -> str:
    """
    Name the ucchaarana sthaana or prayatna of a letter

    Parameters
    ----------
    letter : str
        Sanskrit letter
    dimension : int
        - 0: sthaana
        - 1: aabhyantara prayatna
        - 2: baahya prayatna

        The default is 0.
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names

        The default is False.

    Returns
    -------
    str
        ucchaarana sthaana or prayatna of a letter.
        Several matches are joined by '-' (abbreviations) or by spaces
        followed by 'च' (Sanskrit names); no match gives an empty string.
    """
    if letter.endswith(HALANTA):
        varna = letter.replace(HALANTA, "")
    else:
        varna = letter

    categories = (STHAANA, AABHYANTARA, BAAHYA)[dimension]
    matching_keys = [
        key for key, members in categories.items() if varna in members
    ]

    if abbrev:
        return "-".join(matching_keys)

    names = (STHAANA_NAMES, AABHYANTARA_NAMES, BAAHYA_NAMES)[dimension]
    matching_names = [names[key] for key in matching_keys]
    if len(matching_names) > 1:
        matching_names.append("च")
    return " ".join(matching_names)


def get_ucchaarana_word(
    word: str, dimension: int = 0, abbrev: bool = False
) -> List[Tuple[str, str]]:
    """
    Get ucchaarana of every letter of a word

    The word is split with ``split_varna_word(word, technical=False)``.
    Any resulting unit that contains one of the ``AYOGAVAAHA_COMMON`` marks
    is broken further into its individual characters, and each resulting
    letter is then classified with ``get_ucchaarana_letter``.

    Parameters
    ----------
    word : str
        Sanskrit word (or text)

        **Caution**: If multiple words are provided, the spaces are not
        included in the output list
    dimension : int
        - 0: sthaana
        - 1: aabhyantara prayatna
        - 2: baahya prayatna

        The default is 0.
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names

        The default is False.

    Returns
    -------
    List[Tuple[str, str]]
        List of (letter, ucchaarana) pairs, in order of appearance

    """
    units = []
    for varna in split_varna_word(word, technical=False):
        contains_ayogavaaha = any(
            mark in varna for mark in AYOGAVAAHA_COMMON
        )
        if contains_ayogavaaha:
            # Classify each character of such a unit separately
            units.extend(varna)
        else:
            units.append(varna)

    return [
        (unit, get_ucchaarana_letter(unit, dimension, abbrev))
        for unit in units
    ]


def get_ucchaarana(
    text: str, dimension: int = 0, abbrev: bool = False
) -> List[List[List[Tuple[str, str]]]]:
    """
    Get ucchaarana of every letter in a Sanskrit text

    The stripped text is divided into lines with ``split_lines``, each line
    into whitespace-separated words, and each word is processed with
    ``get_ucchaarana_word``.

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    dimension : int
        - 0: sthaana
        - 1: aabhyantara prayatna
        - 2: baahya prayatna

        The default is 0.
    abbrev : bool
        If True,
            The output will contain English abbreviations
        Otherwise,
            The output will contain Sanskrit names

        The default is False.

    Returns
    -------
    List[List[List[Tuple[str, str]]]]
        List of (letter, ucchaarana) for words in a nested list format
        Nesting Levels: Text -> Lines -> Words
    """
    return [
        [
            get_ucchaarana_word(token, dimension, abbrev)
            for token in row.split()
        ]
        for row in split_lines(text.strip())
    ]


###############################################################################


def get_sthaana_letter(letter: str, abbrev: bool = False):
    """Get sthaana of a letter

    Convenience wrapper around ``get_ucchaarana_letter`` with the dimension
    fixed to 0 (sthaana).

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        Passed through to ``get_ucchaarana_letter``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana_letter`` for dimension 0
    """
    # Second positional argument is `dimension`; 0 selects sthaana
    return get_ucchaarana_letter(letter, 0, abbrev)


def get_sthaana_word(word: str, abbrev: bool = False):
    """Get sthaana of every letter of a word

    Convenience wrapper around ``get_ucchaarana_word`` with the dimension
    fixed to 0 (sthaana).

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
    abbrev : bool
        Passed through to ``get_ucchaarana_word``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana_word`` for dimension 0
    """
    # Second positional argument is `dimension`; 0 selects sthaana
    return get_ucchaarana_word(word, 0, abbrev)


def get_sthaana(text: str, abbrev: bool = False):
    """Get sthaana of every letter in a Sanskrit text

    Convenience wrapper around ``get_ucchaarana`` with the dimension fixed
    to 0 (sthaana).

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    abbrev : bool
        Passed through to ``get_ucchaarana``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana`` for dimension 0
    """
    # Second positional argument is `dimension`; 0 selects sthaana
    return get_ucchaarana(text, 0, abbrev)


# --------------------------------------------------------------------------- #


def get_aabhyantara_letter(letter: str, abbrev: bool = False):
    """Get aabhyantara prayatna of a letter

    Convenience wrapper around ``get_ucchaarana_letter`` with the dimension
    fixed to 1 (aabhyantara prayatna).

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        Passed through to ``get_ucchaarana_letter``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana_letter`` for dimension 1
    """
    # Second positional argument is `dimension`; 1 selects aabhyantara
    return get_ucchaarana_letter(letter, 1, abbrev)


def get_aabhyantara_word(word: str, abbrev: bool = False):
    """Get aabhyantara prayatna of every letter of a word

    Convenience wrapper around ``get_ucchaarana_word`` with the dimension
    fixed to 1 (aabhyantara prayatna).

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
    abbrev : bool
        Passed through to ``get_ucchaarana_word``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana_word`` for dimension 1
    """
    # Second positional argument is `dimension`; 1 selects aabhyantara
    return get_ucchaarana_word(word, 1, abbrev)


def get_aabhyantara(text: str, abbrev: bool = False):
    """Get aabhyantara prayatna of every letter in a Sanskrit text

    Convenience wrapper around ``get_ucchaarana`` with the dimension fixed
    to 1 (aabhyantara prayatna).

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    abbrev : bool
        Passed through to ``get_ucchaarana``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana`` for dimension 1
    """
    # Second positional argument is `dimension`; 1 selects aabhyantara
    return get_ucchaarana(text, 1, abbrev)


# --------------------------------------------------------------------------- #


def get_baahya_letter(letter: str, abbrev: bool = False):
    """Get baahya prayatna of a letter

    Convenience wrapper around ``get_ucchaarana_letter`` with the dimension
    fixed to 2 (baahya prayatna).

    Parameters
    ----------
    letter : str
        Sanskrit letter
    abbrev : bool
        Passed through to ``get_ucchaarana_letter``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana_letter`` for dimension 2
    """
    # Second positional argument is `dimension`; 2 selects baahya
    return get_ucchaarana_letter(letter, 2, abbrev)


def get_baahya_word(word: str, abbrev: bool = False):
    """Get baahya prayatna of every letter of a word

    Convenience wrapper around ``get_ucchaarana_word`` with the dimension
    fixed to 2 (baahya prayatna).

    Parameters
    ----------
    word : str
        Sanskrit word (or text)
    abbrev : bool
        Passed through to ``get_ucchaarana_word``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana_word`` for dimension 2
    """
    # Second positional argument is `dimension`; 2 selects baahya
    return get_ucchaarana_word(word, 2, abbrev)


def get_baahya(text: str, abbrev: bool = False):
    """Get baahya prayatna of every letter in a Sanskrit text

    Convenience wrapper around ``get_ucchaarana`` with the dimension fixed
    to 2 (baahya prayatna).

    Parameters
    ----------
    text : str
        Sanskrit text (can contain newlines, spaces)
    abbrev : bool
        Passed through to ``get_ucchaarana``.
        The default is False.

    Returns
    -------
    Result of ``get_ucchaarana`` for dimension 2
    """
    # Second positional argument is `dimension`; 2 selects baahya
    return get_ucchaarana(text, 2, abbrev)


###############################################################################