import { getTokenizer } from "kuromojin";
import { IJapaneseToken } from "./components/japaneseWithFurigana";

// Offset added to a katakana UTF-16 code unit to get the matching hiragana
// code unit: U+3041 (small hiragana a) minus U+30A1 (small katakana a).
const KATAKANA_HIRAGANA_SHIFT = 0x3041 - 0x30a1;
// FUTURE: enable output to katakana
// const HIRAGANA_KATAKANA_SHIFT = "\u30a1".charCodeAt(0) - "\u3041".charCodeAt(0);

/**
 * Reports whether the first UTF-16 code unit of `ch` is a kanji.
 *
 * Only the first code unit is inspected, so passing a longer string tests
 * its first character only. An empty string yields `false`.
 *
 * Recognised ranges (all in the Basic Multilingual Plane):
 *   - U+4E00–U+9FFF  CJK Unified Ideographs (the whole block, so ideographs
 *     assigned in later Unicode versions are included)
 *   - U+F900–U+FAFF  CJK Compatibility Ideographs
 *   - U+3400–U+4DBF  CJK Unified Ideographs Extension A
 *
 * @param ch String whose first code unit is tested.
 * @returns `true` when that code unit falls in one of the ranges above.
 */
export function isKanji(ch: string): boolean {
  // charCodeAt(0) is NaN for an empty string; NaN fails every comparison
  // below, so "" is reported as not-kanji without a special case.
  const code = ch.charCodeAt(0);
  return (
    (code >= 0x4e00 && code <= 0x9fff) ||
    (code >= 0xf900 && code <= 0xfaff) ||
    (code >= 0x3400 && code <= 0x4dbf)
  );
}

/**
 * Reports whether any UTF-16 code unit of `str` is a kanji according to
 * `isKanji`.
 *
 * @param str Text to scan; an empty string yields `false`.
 * @returns `true` as soon as one kanji code unit is found, otherwise `false`.
 */
export function hasKanji(str: string) {
  // split("") breaks the string into individual UTF-16 code units, the same
  // units an index-based scan would visit.
  return str.split("").some((unit) => isKanji(unit));
}

/**
 * Converts katakana in `str` to hiragana, leaving every other character
 * untouched.
 *
 * Only code units U+30A1 through U+30F6 are shifted; each one is moved by
 * KATAKANA_HIRAGANA_SHIFT. Anything outside that range is copied through
 * as-is.
 *
 * @param str Text to convert. A falsy value (including "") yields "".
 * @returns The text with the katakana range replaced by hiragana.
 */
export function toRawHiragana(str: string) {
  if (!str) {
    return "";
  }
  return str.replace(/[\u30a1-\u30f6]/g, (kana) =>
    String.fromCharCode(kana.charCodeAt(0) + KATAKANA_HIRAGANA_SHIFT)
  );
}

// Matches a manual furigana override of the form `surface「「reading」」`.
// Group 1 is the (possibly empty) surface text, group 2 the non-empty
// reading; the closing 」」 must be the very end of the input.
const furiganaOverrideRegex = /^(.*)「「(.+)」」$/;

/**
 * Splits Japanese text into tokens.
 *
 * When the input is a manual override (`surface「「reading」」`), a single
 * token built from the two captured parts is returned and the tokenizer is
 * not consulted. Otherwise the text is handed to the kuromojin tokenizer
 * loaded with the dictionary at "/dict/".
 *
 * @param japanese Text to tokenize, optionally in override form.
 * @returns The tokens for the text.
 */
export async function tokenize(japanese: string): Promise<IJapaneseToken[]> {
  const override = japanese.match(furiganaOverrideRegex);
  if (override === null) {
    // getTokenizer uses a built in cache
    const tokenizer = await getTokenizer({ dicPath: "/dict/" });
    return tokenizer.tokenize(japanese);
  }

  const [, surface_form, reading] = override;
  return [{ surface_form, reading }];
}

/**
 * Produces the reading of `japanese` by tokenizing it and concatenating the
 * `reading` field of every token, in order, with no separator.
 *
 * NOTE(review): `join` renders a missing (`undefined`/`null`) reading as an
 * empty string; whether tokens can lack a reading depends on the tokenizer
 * and is not visible here.
 *
 * @param japanese Text to read.
 * @returns The concatenated token readings.
 */
export async function reading(japanese: string): Promise<string> {
  const tokens = await tokenize(japanese);
  return tokens.map(({ reading: tokenReading }) => tokenReading).join("");
}

/**
 * Builds furigana HTML for a single word: the reading is returned with every
 * character that also occurs in the surface form wrapped in
 * `<span style="opacity:0">…</span>`, so only the part of the reading that
 * stands for kanji remains visible.
 *
 * Surface form is the full, original word, including kanji and hiragana.
 * NOTE: this function should only be called for a single word, not for
 * multiple words at a time.
 *
 * In Japanese, a word can be
 *   a) hiragana alone,
 *   b) katakana alone,
 *   c) kanji followed by hiragana (called okurigana).
 *
 * Furigana, the hiragana annotations above the kanji in a Japanese word,
 * should only be shown above the kanji themselves. Showing furigana above
 * okurigana (themselves hiragana) would be redundant.
 *
 * Because okurigana come after the kanji in a word, the surface form is
 * walked from back to front and each character is looked up in the reading,
 * also from the back. Each lookup is limited to the part of the reading that
 * lies before the previous match. This keeps matches in word order, lets a
 * kana that occurs more than once (e.g. いろいろ, 離れ離れ) be hidden at every
 * position, and keeps the search away from the markup already emitted.
 *
 * Characters are compared verbatim, so a katakana surface character does not
 * match its hiragana counterpart in the reading.
 *
 * NOTE(review): both inputs are interpolated into the HTML unescaped; confirm
 * that callers never pass text containing markup characters from an
 * untrusted source.
 *
 * @param hiragana Reading of the word.
 * @param surface_form The word as written.
 * @returns The reading as HTML with the shared characters made invisible.
 */
export function readingHiraganaToFuriganaHTML(hiragana: string, surface_form: string): string {
  // Iterate by code point so a surrogate pair is never split by a span.
  const surfaceCharsBackToFront = Array.from(surface_form).reverse();

  // `tail` is the finished HTML for hiragana[unmatchedEnd..]; the text before
  // `unmatchedEnd` is still plain reading that later lookups may search.
  let tail = "";
  let unmatchedEnd = hiragana.length;

  for (const char of surfaceCharsBackToFront) {
    // A match has to end at or before `unmatchedEnd`. lastIndexOf clamps a
    // negative position to 0, so the "no room left" case is skipped by hand.
    const latestStart = unmatchedEnd - char.length;
    if (latestStart < 0) continue;

    const matchStart = hiragana.lastIndexOf(char, latestStart);
    if (matchStart === -1) continue;

    tail =
      `<span style="opacity:0">${char}</span>` +
      hiragana.substring(matchStart + char.length, unmatchedEnd) +
      tail;
    unmatchedEnd = matchStart;
  }

  return hiragana.substring(0, unmatchedEnd) + tail;
}
