-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Reimplement ZawgyiDetector without numpy * Require Python 3.8
- Loading branch information
1 parent
b6a4762
commit 92a7052
Showing
9 changed files
with
195 additions
and
182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,5 @@ | ||
''' | ||
Myanmar Tools | ||
============= | ||
"""Tools for processing font encodings used in Myanmar.""" | ||
|
||
Myanmar Tools implements tools for processing font encodings used in Myanmar. | ||
It currently supports Zawgyi detection. | ||
To detect Zawgyi, create an instance of ZawgyiDetector, and call | ||
``get_zawgyi_probability`` with a string:: | ||
from myanmartools import ZawgyiDetector | ||
detector = ZawgyiDetector() | ||
score = detector.get_zawgyi_probability('မ္း') | ||
# score is now 0.999772 (very likely Zawgyi) | ||
For Zawgyi-to-Unicode conversion, you can use the ICU library. Install it | ||
using ``pip install PyICU``. | ||
To convert Zawgyi to Unicode, create an instance of ICU Transliterator with | ||
the transform ID "Zawgyi-my", and call :code:`transliterate` with a string:: | ||
from icu import Transliterator | ||
converter = Transliterator.createInstance('Zawgyi-my') | ||
output = converter.transliterate('မ္း') | ||
# output is now 'မ်း' | ||
''' | ||
from .zawgyi_detector import ZawgyiDetector | ||
|
||
__all__ = ['ZawgyiDetector'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,80 +1,93 @@ | ||
from importlib.resources import open_binary | ||
import numpy as np | ||
"""Helper functions for reading parameters of the model file.""" | ||
|
||
from array import array | ||
from itertools import chain, repeat | ||
import struct | ||
from typing import BinaryIO, cast, Final, Iterator, Tuple | ||
|
||
# Code-point ranges of the characters known to the model.  Each range end is
# exclusive, so the last code point covered is given in the trailing comment.

# Myanmar block, the part before the digits
STD: Final = range(0x1000, 0x1040)  # U+1000..U+103F
# Myanmar block, the part after the digits
AFT: Final = range(0x104A, 0x10A0)  # U+104A..U+109F
# Myanmar Extended-A block
EXA: Final = range(0xAA60, 0xAA80)  # U+AA60..U+AA7F
# Myanmar Extended-B block
EXB: Final = range(0xA9E0, 0xAA00)  # U+A9E0..U+A9FF
# Unicode space characters
SPC: Final = range(0x2000, 0x200C)  # U+2000..U+200B
|
||
|
||
def check_signature(stream: BinaryIO) -> str:
    """
    Check signature of the model file and return characters used by the model.

    The returned characters are in the model's index order: the character at
    position ``i`` of the result has state ``i + 1`` in the parameter table
    (state 0 is reserved for every other character).  This order is NOT
    sorted by code point -- Myanmar Extended-A (U+AA60..) comes before
    Extended-B (U+A9E0..), and the space characters (U+2000..), when present,
    come last -- so the result must not be searched with a binary search.

    The stream is left positioned just after the header, ready for the
    parameters to be read.

    Raises:
        IOError: if a tag or a version number in the header is not recognised.
        ValueError: if the ``ssv`` field of a version 2 header is not 0 or 1.
    """
    uzmodel_tag = stream.read(8)
    if uzmodel_tag != b'UZMODEL ':
        raise IOError('invalid uzmodel_tag')
    uzmodel_version = read_int(stream)

    # version 1 files carry no ssv field and behave like ssv == 0
    if uzmodel_version == 1:
        ssv = 0
    elif uzmodel_version == 2:
        ssv = read_int(stream)
    else:
        raise IOError('invalid uzmodel_version')

    # ssv selects whether the space characters are part of the character table
    if ssv == 0:
        chars = ''.join(map(chr, chain(STD, AFT, EXA, EXB, SPC)))
    elif ssv == 1:
        chars = ''.join(map(chr, chain(STD, AFT, EXA, EXB)))
    else:
        raise ValueError('invalid ssv')

    bmarkov_tag = stream.read(8)
    if bmarkov_tag != b'BMARKOV ':
        raise IOError('invalid bmarkov_tag')
    bmarkov_version = read_int(stream)
    if bmarkov_version != 0:
        raise IOError('invalid bmarkov_version')

    return chars
|
||
|
||
def read_params(stream: BinaryIO) -> 'array[float]':
    """
    Read the parameter table of the model file.

    The table is square and returned flattened row by row, so the entry for
    (row, column) lives at ``row * size + column``.  A row stored with a zero
    count stays all zeros; any other row is filled with its default value and
    then patched with the explicitly listed (column, value) pairs.
    """
    size = read_short(stream)
    table = array('f', [0.0]) * (size * size)
    for row in range(size):
        n_special = read_short(stream)
        if n_special == 0:
            continue
        start = row * size
        # the row's default value comes first and fills the whole row
        default = read_float(stream)
        table[start:start + size] = array('f', [default]) * size
        # the listed columns then override the default
        for column, special in read_pairs(stream, n_special):
            table[start + column] = special
    return table
|
||
|
||
def read_short(stream: BinaryIO) -> int:
    """Read a signed 2-byte integer stored in big-endian order."""
    (value,) = struct.unpack('>h', stream.read(2))
    return cast(int, value)
|
||
|
||
def read_int(stream: BinaryIO) -> int:
    """Read a signed 4-byte integer stored in big-endian order."""
    (value,) = struct.unpack('>i', stream.read(4))
    return cast(int, value)
|
||
|
||
def read_float(stream: BinaryIO) -> float:
    """Read a 4-byte float stored in big-endian order."""
    (value,) = struct.unpack('>f', stream.read(4))
    return cast(float, value)
|
||
|
||
def get_mapping():
    '''
    Builds the lookup table from Myanmar Unicode characters to their
    indices in the parameter array.

    Indices start at 1 and follow the order in which the code-point spans
    are listed below.

    Returns
    -------
    dict
        A mapping from Myanmar Unicode characters to indices.
    '''
    # inclusive (first, last) code points of every span, in index order
    spans = (
        (0x1000, 0x103F),
        (0x104A, 0x109F),
        (0xAA60, 0xAA7F),
        (0xA9E0, 0xA9FF),
        (0x2000, 0x200B),
    )

    mapping = {}
    for first, last in spans:
        for code in range(first, last + 1):
            # the spans do not overlap, so the next free index is len + 1
            mapping[chr(code)] = len(mapping) + 1
    return mapping
|
||
def load_params():
    '''
    Loads the model parameters as a 2d array of log likelihood ratios of
    Unicode to Zawgyi.

    Returns
    -------
    numpy.ndarray
        Parameters as a 2d array.
    '''
    def unpack_one(handle, fmt):
        # read exactly the bytes the format needs and return its only field
        return struct.unpack(fmt, handle.read(struct.calcsize(fmt)))[0]

    def read_tag(handle):
        return unpack_one(handle, '8s').decode('utf-8')

    with open_binary(
        'myanmartools.resources', 'zawgyiUnicodeModel.dat'
    ) as model:
        # header: tags and versions must match what this reader understands
        if read_tag(model) != 'UZMODEL ':
            raise IOError('incorrect uzmodel_tag')
        version = unpack_one(model, '>i')
        if version == 2:
            # version 2 stores an extra int here; it is read and not used
            unpack_one(model, '>i')
        elif version != 1:
            raise IOError('incorrect uzmodel_version')
        if read_tag(model) != 'BMARKOV ':
            raise IOError('incorrect bmarkov_tag')
        if unpack_one(model, '>i') != 0:
            raise IOError('incorrect bmarkov_version')

        # body: rows with a zero count stay zero; other rows get a default
        # value followed by explicitly listed (column, value) overrides
        dim = unpack_one(model, '>h')
        table = np.zeros((dim, dim))
        for row in range(dim):
            n_special = unpack_one(model, '>h')
            if n_special == 0:
                continue
            table[row] = unpack_one(model, '>f')
            for _ in range(n_special):
                col = unpack_one(model, '>h')
                table[row, col] = unpack_one(model, '>f')

    return table
def read_pairs(stream: BinaryIO, n: int) -> Iterator[Tuple[int, float]]:
    """
    Read n (short, float) pairs stored in big-endian order.

    All ``6 * n`` bytes are consumed from the stream right away; the pairs
    are then decoded lazily from that buffer.
    """
    raw = stream.read(6 * n)
    pairs = struct.iter_unpack('>hf', raw)
    return cast(Iterator[Tuple[int, float]], pairs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
""" | ||
Resources for myanmartools. | ||
This package contains: | ||
- `zawgyiUnicodeModel.dat` - parameters of Zawgyi detector | ||
- `compatability.tsv` - Zawgyi probabilities and input strings | ||
resulting from Java implementation of Zawgyi detector | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,50 +1,69 @@ | ||
import numpy as np | ||
from ._params import get_mapping, load_params | ||
"""Zawgyi detector module.""" | ||
|
||
from bisect import bisect_left | ||
from importlib.resources import open_binary | ||
from itertools import chain, filterfalse | ||
from math import exp, inf, isnan, nan | ||
from typing import Iterator, Optional | ||
|
||
from ._params import check_signature, read_params | ||
|
||
|
||
class ZawgyiDetector:
    """
    A detector of Myanmar Zawgyi encoding.

    A string is scored with the log-likelihood ratios of Unicode to Zawgyi
    for every pair of consecutive characters, read from the model file, and
    the summed ratio is turned into a probability.
    """

    __slots__ = ['_chars', '_params']

    def __init__(self) -> None:
        """Initialize the detector from the packaged model file."""
        with open_binary(
            'myanmartools.resources',
            'zawgyiUnicodeModel.dat'
        ) as stream:
            self._chars = check_signature(stream)
            self._params = read_params(stream)
        # State 0 stands for foreign characters, so the 0-to-0 transition
        # says nothing about the encoding; mark it as nan so that it can be
        # filtered out when the ratios are summed.
        self._params[0] = nan

    def _state(self, char: Optional[str]) -> int:
        """
        Return the state of a character.

        The state is the 1-based position of the character in the model's
        character table.  Return 0 for None and for foreign characters.
        """
        if char is None or len(char) != 1:
            return 0
        # The table is in the model's index order, which is not sorted by
        # code point, so a binary search would miss some of its characters.
        # str.find returns -1 when the character is absent, which maps to
        # the foreign state 0.
        return self._chars.find(char) + 1

    def _llrs(self, string: str) -> Iterator[float]:
        """
        Return the log-likelihood ratios of consecutive character pairs.

        The first and last characters are paired with the foreign state on
        the left and right respectively, so a string of n characters yields
        n + 1 ratios.
        """
        size = len(self._chars) + 1
        params = self._params
        # resolve every character once; each state is used in two pairs
        states = [self._state(char) for char in string]
        return (
            params[previous * size + current]
            for previous, current in zip(
                chain((0,), states),
                chain(states, (0,))
            )
        )

    def get_zawgyi_probability(self, string: str) -> float:
        """
        Return the Zawgyi probability of a string.

        The result is between 0 and 1.  Return negative infinity if there
        are only foreign characters (which includes the empty string).
        """
        # foreign-to-foreign transitions are nan and carry no information
        llrs = list(filterfalse(isnan, self._llrs(string)))
        if not llrs:
            return -inf
        total = sum(llrs)
        # Pz/(Pu+Pz) = exp(lnPz)/(exp(lnPu)+exp(lnPz)) = 1/(1+exp(lnPu-lnPz))
        # prevent overflow when positive
        if total >= 0:
            z = exp(-total)
            return z / (z + 1)
        return 1 / (1 + exp(total))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
"""Tests for myanmartools.""" |
Oops, something went wrong.