Re-implement python client (#65)

* Reimplement ZawgyiDetector without numpy
* Require Python 3.8
google · Dec 29, 2020 · 92a7052 · 92a7052
1 parent b6a4762
commit 92a7052
Show file tree

Hide file tree

Showing 9 changed files with 195 additions and 182 deletions.
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,7 @@ build/
 # checking in composer.lock causes failures due to incompatible PHP versions
 composer.lock
 composer.phar
+# python
+__pycache__/
+venv/
+*.egg-info/
diff --git a/.travis.yml b/.travis.yml
@@ -120,7 +120,7 @@ matrix:
 
     # Python Client
     - language: python
-      python: 3.7
+      python: 3.8
       before_script:
         - cd clients/python
         - python setup.py install

diff --git a/clients/python/setup.py b/clients/python/setup.py
@@ -10,14 +10,13 @@
     author='William (Wai Yan) Zhu',
     author_email='williamzhu345@gmail.com',
     classifiers=[
-	'Development Status :: 4 - Beta',
-	'Intended Audience :: Developers',
-	'Intended Audience :: Science/Research',
-	'License :: OSI Approved :: Apache Software License',
-	'Operating System :: OS Independent',
-	'Programming Language :: Python :: 3.7',
-	'Programming Language :: Python :: 3.8',
-	'Topic :: Text Processing'
+        'Development Status :: 4 - Beta',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: Apache Software License',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python :: 3.8',
+        'Topic :: Text Processing'
     ],
     license='Apache License, Version 2.0',
     description='Tools for processing font encodings used in Myanmar',
@@ -29,6 +28,5 @@
     package_dir={'': 'src'},
     include_package_data=True,
     package_data={'myanmartools': ['resources/*']},
-    install_requires=['numpy>=1.18'],
-    python_requires='>=3.7'
+    python_requires='>=3.8'
 )
diff --git a/clients/python/src/myanmartools/__init__.py b/clients/python/src/myanmartools/__init__.py
@@ -1,29 +1,5 @@
-'''
-Myanmar Tools
-=============
+"""Tools for processing font encodings used in Myanmar."""
 
-Myanmar Tools implements tools for processing font encodings used in Myanmar.
-It currently supports Zawgyi detection.
-
-To detect Zawgyi, create an instance of ZawgyiDetector, and call
-``get_zawgyi_probability`` with a string::
-
-    from myanmartools import ZawgyiDetector
-
-    detector = ZawgyiDetector()
-    score = detector.get_zawgyi_probability('မ္း')
-    # score is now 0.999772 (very likely Zawgyi)
-
-For Zawgyi-to-Unicode conversion, you can use the ICU library. Install it
-using ``pip install PyICU``.
-
-To convert Zawgyi to Unicode, create an instance of ICU Transliterator with
-the transform ID "Zawgyi-my", and call :code:`transiliterate` with a string::
-
-    from icu import Transliterator
-
-    converter = Transliterator.createInstance('Zawgyi-my')
-    output = converter.transliterate('မ္း')
-    # output is now 'မ်း'
-'''
 from .zawgyi_detector import ZawgyiDetector
+
+__all__ = ['ZawgyiDetector']
diff --git a/clients/python/src/myanmartools/_params.py b/clients/python/src/myanmartools/_params.py
@@ -1,80 +1,93 @@
-from importlib.resources import open_binary
-import numpy as np
+"""Helper functions for reading parameters of the model file."""
+
+from array import array
+from itertools import chain, repeat
 import struct
+from typing import BinaryIO, cast, Final, Iterator, Tuple
+
+# Myanmar Unicode characters before digits
+STD: Final = range(0x1000, 0x103F + 1)
+# Myanmar Unicode characters after digits
+AFT: Final = range(0x104A, 0x109F + 1)
+# Myanmar Extended-A Unicode characters
+EXA: Final = range(0xAA60, 0xAA7F + 1)
+# Myanmar Extended-B Unicode characters
+EXB: Final = range(0xA9E0, 0xA9FF + 1)
+# Unicode space characters
+SPC: Final = range(0x2000, 0x200B + 1)
+
+
+def check_signature(stream: BinaryIO) -> str:
+    """
+    Check signature of the model file and return characters used by the model.
+
+    The characters follow the chained range order, not code point order.
+    """
+    uzmodel_tag = stream.read(8)
+    if uzmodel_tag != b'UZMODEL ':
+        raise IOError('invalid uzmodel_tag')
+    uzmodel_version = read_int(stream)
+
+    if uzmodel_version == 1:
+        ssv = 0
+    elif uzmodel_version == 2:
+        ssv = read_int(stream)
+    else:
+        raise IOError('invalid uzmodel_version')
+
+    if ssv == 0:
+        chars = ''.join(map(chr, chain(STD, AFT, EXA, EXB, SPC)))
+    elif ssv == 1:
+        chars = ''.join(map(chr, chain(STD, AFT, EXA, EXB)))
+    else:
+        raise ValueError('invalid ssv')
+
+    bmarkov_tag = stream.read(8)
+    if bmarkov_tag != b'BMARKOV ':
+        raise IOError('invalid bmarkov_tag')
+    bmarkov_version = read_int(stream)
+    if bmarkov_version != 0:
+        raise IOError('invalid bmarkov_version')
+
+    return chars
+
+
+def read_params(stream: BinaryIO) -> 'array[float]':
+    """Read parameters in the model file."""
+    size = read_short(stream)
+    params = array('f', repeat(0, size * size))
+    for i in range(size):
+        count = read_short(stream)
+        if count != 0:
+            offset = i * size
+            # set default value
+            value = read_float(stream)
+            for index in range(size):
+                params[offset + index] = value
+            # set special values
+            for index, value in read_pairs(stream, count):
+                params[offset + index] = value
+    return params
+
+
+def read_short(stream: BinaryIO) -> int:
+    """Read a short integer value in big-endian order."""
+    return cast(int, struct.unpack('>h', stream.read(2))[0])
+
+
+def read_int(stream: BinaryIO) -> int:
+    """Read an integer value in big-endian order."""
+    return cast(int, struct.unpack('>i', stream.read(4))[0])
+
+
+def read_float(stream: BinaryIO) -> float:
+    """Read a float value in big-endian order."""
+    return cast(float, struct.unpack('>f', stream.read(4))[0])
+
 
-def get_mapping():
-    '''
-    Generates a mapping of Myanmar Unicode characters to their corresponding
-    indices in the parameter array.
-
-    Returns
-    -------
-    dict
-        A mapping from Myanmar Unicode characters to indices.
-    '''
-    def get_chars(start, end):
-        return [chr(char) for char in range(ord(start), ord(end) + 1)]
-
-    chars = (get_chars('\u1000', '\u103F')
-            + get_chars('\u104A', '\u109F')
-            + get_chars('\uAA60', '\uAA7F')
-            + get_chars('\uA9E0', '\uA9FF')
-            + get_chars('\u2000', '\u200B'))
-
-    return {char: i + 1 for i, char in enumerate(chars)}
-
-def load_params():
-    '''
-    Loads parameters as a 2d array, which are log likelihood ratios of
-    Unicode to Zawgyi.
-
-    Returns
-    -------
-    numpy.ndarray
-        Parameters as a 2d array.
-    '''
-    def read_char_array(f, size):
-        return struct.unpack(f'{size}s', f.read(size))[0].decode('utf-8')
-
-    def read_float(f):
-        return struct.unpack('>f', f.read(4))[0]
-
-    def read_int(f):
-        return struct.unpack('>i', f.read(4))[0]
-
-    def read_short(f):
-        return struct.unpack('>h', f.read(2))[0]
-
-    with open_binary('myanmartools.resources', 'zawgyiUnicodeModel.dat') as f:
-        # check signature
-        uzmodel_tag = read_char_array(f, 8)
-        if uzmodel_tag != 'UZMODEL ':
-            raise IOError('incorrect uzmodel_tag')
-        uzmodel_version = read_int(f)
-        if uzmodel_version == 1:
-            ssv = 0
-        elif uzmodel_version == 2:
-            ssv = read_int(f)
-        else:
-            raise IOError('incorrect uzmodel_version')
-        bmarkov_tag = read_char_array(f, 8)
-        if bmarkov_tag != 'BMARKOV ':
-            raise IOError('incorrect bmarkov_tag')
-        bmarkov_version = read_int(f)
-        if bmarkov_version != 0:
-            raise IOError('incorrect bmarkov_version')
-
-        # read params
-        size = read_short(f)
-        params = np.empty((size, size))
-        for row in range(size):
-            count = read_short(f)
-            if count != 0:
-                params[row] = read_float(f)
-                for i in range(count):
-                    col = read_short(f)
-                    params[row, col] = read_float(f)
-            else:
-                params[row] = 0
-
-        return params
+def read_pairs(stream: BinaryIO, n: int) -> Iterator[Tuple[int, float]]:
+    """Read n short-float value pairs in big-endian order."""
+    return cast(
+        Iterator[Tuple[int, float]],
+        struct.iter_unpack('>hf', stream.read(6 * n))
+    )
diff --git a/clients/python/src/myanmartools/resources/__init__.py b/clients/python/src/myanmartools/resources/__init__.py
@@ -0,0 +1,9 @@
+"""
+Resources for myanmartools.
+
+This package contains:
+
+- `zawgyiUnicodeModel.dat` - parameters of Zawgyi detector
+- `compatability.tsv` - Zawgyi probabilities and input strings
+  resulting from Java implementation of Zawgyi detector
+"""
diff --git a/clients/python/src/myanmartools/zawgyi_detector.py b/clients/python/src/myanmartools/zawgyi_detector.py
@@ -1,50 +1,69 @@
-import numpy as np
-from ._params import get_mapping, load_params
+"""Zawgyi detector module."""
+
+from bisect import bisect_left
+from importlib.resources import open_binary
+from itertools import chain, filterfalse
+from math import exp, inf, isnan, nan
+from typing import Iterator, Optional
+
+from ._params import check_signature, read_params
+
 
 class ZawgyiDetector:
-    '''
-    An estimator that predicts Zawgyi using two Markov chains, one
-    for Unicode text and the other for Zawgyi text.
-
-    Attributes
-    ----------
-    mapping : dict
-        A mapping of Myanmar Unicode characters to the corresponding indices
-        in the parameter array.
-    params : numpy.ndarray
-        A parameter array containing log likelihood ratios of
-        Unicode to Zawgyi.
-    '''
-    def __init__(self):
-        self.mapping = get_mapping()
-        self.params = load_params()
-
-    def get_zawgyi_probability(self, string):
-        '''
-        Computes Zawgyi probability.
-
-        Parameters
-        ----------
-        string : str
-            String to predict Zawgyi on.
-
-        Returns
-        -------
-        float
-            Zawgyi probability between 0 and 1, or negative infinity
-            if there is no Myanmar Unicode character.
-        '''
-        indices = [self.mapping.get(char, 0) for char in string]
-        # include starting and ending state probabilities
-        previous = np.array([0] + indices)
-        current = np.array(indices + [0])
-        # ignore 0-to-0 transitions
-        mask = np.logical_or(previous != 0, current != 0)
-        # return negative inifinity if there are only 0-to-0 transitions,
-        # which happens when there is no Myanmar Unicode character
-        if not mask.any():
-            return -np.inf
-        # Pz/(Pu+Pz) = exp(logPz)/(exp(logPu)+exp(logPz))
-        #            = 1/(1+exp(logPu-logPz))
-        return 1.0 / (1.0 + 
-            np.exp(self.params[previous[mask], current[mask]].sum()))
+    """A detector of Myanmar Zawgyi encoding."""
+
+    __slots__ = ['_chars', '_params']
+
+    def __init__(self) -> None:
+        """Initialize the detector."""
+        with open_binary(
+            'myanmartools.resources',
+            'zawgyiUnicodeModel.dat'
+        ) as stream:
+            self._chars = check_signature(stream)
+            self._params = read_params(stream)
+            # index 0 is the foreign-to-foreign transition so mark as nan
+            self._params[0] = nan
+
+    def _state(self, char: Optional[str]) -> int:
+        """
+        Return the state of a character.
+
+        Return 0 for foreign characters.
+        """
+        if char is None:
+            return 0
+        i = bisect_left(self._chars, char)
+        if i < len(self._chars) and self._chars[i] == char:
+            return i + 1
+        return 0
+
+    def _llrs(self, string: str) -> Iterator[float]:
+        """
+        Return the log-likelihood ratios of consecutive character pairs.
+
+        The first and last characters are paired with None on the left
+        and right respectively.
+        """
+        size = len(self._chars) + 1
+        return map(
+            lambda i, j: self._params[self._state(i) * size + self._state(j)],
+            chain((None,), string),
+            chain(string, (None,))
+        )
+
+    def get_zawgyi_probability(self, string: str) -> float:
+        """
+        Return the Zawgyi probability of a string.
+
+        Return negative infinity if there are only foreign characters.
+        """
+        if all(map(isnan, self._llrs(string))):
+            return -inf
+        total = sum(filterfalse(isnan, self._llrs(string)))
+        # Pz/(Pu+Pz) = exp(lnPz)/(exp(lnPu)+exp(lnPz)) = 1/(1+exp(lnPu-lnPz))
+        # prevent overflow when positive
+        if total >= 0:
+            z = exp(-total)
+            return z / (z + 1)
+        return 1 / (1 + exp(total))
diff --git a/clients/python/tests/__init__.py b/clients/python/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for myanmartools."""