# -*- coding: utf-8 -*-
"""Helpers to encode Japanese characters.
I doubt that this currently works correctly.
"""
try:
    # jaconv is an optional dependency; it is only used for kana conversion.
    import jaconv
except ImportError:
    # Sentinel checked by encode_katakana(): when jaconv is unavailable the
    # input text is looked up as-is, without any kana conversion.
    jaconv = None
def encode_katakana(text: str) -> bytes:
    """Encode Japanese text as bytes of the KATAKANA code page.

    Each character of *text* is translated through ``TXT_ENC_KATAKANA_MAP``.
    When the optional ``jaconv`` module is available, every character is
    first converted from hiragana to katakana and then to its half-width
    form.

    This is a best-effort encoder: characters that have no entry in the
    map (kanji, ASCII, ...) are silently skipped, so the result may be
    shorter than the input.

    :param text: the text to encode
    :returns: the concatenated page-code bytes of all encodable characters
    """
    # Local import so that this function does not depend on an additional
    # module-level import.
    import unicodedata

    encoded = []
    for char in text:
        if jaconv:
            # Try to convert Japanese text to half-width katakana.
            char = jaconv.z2h(jaconv.hira2kata(char))
        # The conversion may yield more than one character for a single
        # input character (presumably a base kana followed by a separate
        # voicing mark -- see the jaconv documentation), so every resulting
        # character is looked up on its own instead of as one combined key.
        for symbol in char:
            page_code = TXT_ENC_KATAKANA_MAP.get(symbol)
            if page_code is None:
                # Retry with the NFKC-normalised form so that a half-width
                # symbol also matches a full-width key of the table.
                page_code = TXT_ENC_KATAKANA_MAP.get(
                    unicodedata.normalize("NFKC", symbol)
                )
            if page_code is not None:
                encoded.append(page_code)
            # TODO: symbols without a mapping are dropped here; consider
            # raising an exception if callers need to detect lossy encoding.
    return b"".join(encoded)
# Maps katakana symbols (and related punctuation) to KATAKANA page codes.
# The page codes are consecutive single bytes running from 0xA1 to 0xDF, so
# the table is generated from the symbols listed in page-code order.
# TODO: does this really have to be hardcoded?
TXT_ENC_KATAKANA_MAP = {
    symbol: bytes((page_code,))
    for page_code, symbol in enumerate(
        (
            # 0xA1 - 0xA5: punctuation
            "。", "「", "」", "、", "・",
            # 0xA6 - 0xAF: wo, small kana
            "ヲ", "ァ", "ィ", "ゥ", "ェ", "ォ", "ャ", "ュ", "ョ", "ッ",
            # 0xB0: prolonged sound mark
            "ー",
            # 0xB1 - 0xBF
            "ア", "イ", "ウ", "エ", "オ",
            "カ", "キ", "ク", "ケ", "コ",
            "サ", "シ", "ス", "セ", "ソ",
            # 0xC0 - 0xCF
            "タ", "チ", "ツ", "テ", "ト",
            "ナ", "ニ", "ヌ", "ネ", "ノ",
            "ハ", "ヒ", "フ", "ヘ", "ホ",
            "マ",
            # 0xD0 - 0xDD
            "ミ", "ム", "メ", "モ",
            "ヤ", "ユ", "ヨ",
            "ラ", "リ", "ル", "レ", "ロ",
            "ワ", "ン",
            # 0xDE - 0xDF: voicing marks
            "゙",
            "゚",
        ),
        start=0xA1,
    )
}