import Cedict from "./cedict";

/**
 * A single dictionary entry associated with a token.
 *
 * The tokenizer obtains these objects from `Cedict` lookups and builds
 * `IToken.matches` from them.
 */
export interface IEntry {
  /** Pinyin reading as stored by the dictionary (exact format is defined by `Cedict` — TODO confirm). */
  pinyin: string;
  /** Display-oriented pinyin; presumably tone-marked — TODO confirm against `Cedict`. */
  pinyinPretty: string;
  /** English gloss text for the entry. */
  english: string;
  /** Traditional-script form of the headword; compared against the input when matching traditional text. */
  traditional: string;
  /** Simplified-script form of the headword; compared against the input when matching simplified text. */
  simplified: string;
}

/**
 * Location of a token within the tokenized input.
 *
 * Positions refer to the input after carriage returns (`\r`) have been
 * removed. During normal tokenization `offset` starts at 0 while `line` and
 * `column` start at 1.
 */
export interface IPosition {
  /** Distance from the start of the text, counted in UTF-16 code units (`String.length`). */
  offset: number;
  /** Line number; advanced once for every `\n` contained in preceding tokens. */
  line: number;
  /** Column within the current line, counted in UTF-16 code units; reset after a line break. */
  column: number;
}

/**
 * One segment of the tokenized text: a dictionary word, a single character,
 * a whitespace character, or a run of non-Chinese characters.
 */
export interface IToken {
  /** The exact slice of input text covered by this token. */
  text: string;
  /** Dictionary entries found for `text`; empty when the dictionary has no entry for it. */
  matches: IEntry[];
  /** Where the token starts in the input. */
  position: IPosition;
}

/**
 * Punctuation marks that are treated as stand-alone Chinese characters, so
 * each one becomes its own token instead of being merged into a run of
 * non-Chinese text.
 *
 * Every mark is a single code point, so splitting the literal with
 * `Array.from` yields one array element per mark.
 */
const chinesePunctuation: string[] = Array.from("·×—‘’“”…、。《》『』【】！（），：；？");

/**
 * Downloads the dictionary text served at `/cedict_ts.u8.txt`.
 *
 * @returns The response body as text.
 * @throws Error when the server answers with a non-2xx status. Without this
 *   check an error page (e.g. a 404 body) would be returned as if it were the
 *   dictionary contents. Network failures reject as `fetch` itself does.
 */
async function fetchCedict(): Promise<string> {
  const url = "/cedict_ts.u8.txt";
  const resp = await fetch(url);
  if (!resp.ok) {
    throw new Error(`Failed to fetch ${url}: ${resp.status} ${resp.statusText}`);
  }
  return resp.text();
}

/**
 * Splits Chinese text into tokens and attaches dictionary entries to them.
 *
 * The class behaves as a singleton: every `new Tokenizer()` returns the same
 * instance, so the dictionary is shared by all callers.
 */
class Tokenizer {
  private static instance?: Tokenizer;
  /** Hoisted so the pattern is not re-created for every character inspected. */
  private static readonly whitespace = /\s/;
  private dictionary!: Cedict;
  private isLoaded!: boolean;
  /** In-flight dictionary load, shared by concurrent `Tokenize()` calls. */
  private loading?: Promise<void>;

  public constructor() {
    if (Tokenizer.instance) {
      // Returning an object from a constructor makes `new` yield that object,
      // which is how the shared instance is handed out.
      return Tokenizer.instance;
    }
    Tokenizer.instance = this;
    this.dictionary = new Cedict();
    this.isLoaded = false;
  }

  /**
   * Concatenates two entry lists, dropping entries that repeat an earlier one
   * (same forms, pinyin and gloss). The order of first occurrences is kept.
   */
  private static mergeEntries(first: IEntry[], second: IEntry[]): IEntry[] {
    const seen = new Set<string>();
    const merged: IEntry[] = [];
    for (const entry of [...first, ...second]) {
      const key = [entry.traditional, entry.simplified, entry.pinyin, entry.english].join("\u0000");
      if (!seen.has(key)) {
        seen.add(key);
        merged.push(entry);
      }
    }
    return merged;
  }

  /** Fetches and loads the dictionary, then clears the in-flight marker. */
  private async loadDictionary(): Promise<void> {
    try {
      const cedictContents = await fetchCedict();
      this.dictionary.load(cedictContents);
      this.isLoaded = true;
    } finally {
      // Clearing the marker lets a later call retry after a failed load.
      this.loading = undefined;
    }
  }

  /**
   * Resolves once the dictionary is loaded. Calls made while a load is in
   * progress wait on the same promise instead of starting another fetch.
   */
  private ensureLoaded(): Promise<void> {
    if (this.isLoaded) {
      return Promise.resolve();
    }
    if (!this.loading) {
      this.loading = this.loadDictionary();
    }
    return this.loading;
  }

  /**
   * Tokenizes `t`.
   *
   * Carriage returns are removed first. At each position the longest
   * dictionary word starting there (two or more characters) is preferred;
   * otherwise a single Chinese character, punctuation mark or whitespace
   * character becomes a token; otherwise a run of non-Chinese characters up
   * to the next whitespace or Chinese character becomes one token.
   *
   * @param t Text to tokenize.
   * @returns The tokens in input order. If the dictionary cannot be loaded, a
   *   single token holding the whole input with no matches and a position of
   *   offset 0, line 0, column 0 is returned.
   */
  public async Tokenize(t: string): Promise<IToken[]> {
    try {
      await this.ensureLoaded();
    } catch {
      // Couldn't load required data, so there's nothing to do.
      return [{ text: t, matches: [], position: { offset: 0, line: 0, column: 0 } }];
    }

    // Work on code points so characters outside the BMP count as one character.
    const text = Array.from(t.replace(/\r/g, ""));

    const result: IToken[] = [];
    let i = 0;
    let [offset, line, column] = [0, 1, 1];
    // Running tallies of evidence that the text is simplified or traditional;
    // they decide which entries to report for words valid in both scripts.
    let [simplifiedPreference, traditionalPreference] = [0, 0];

    // Appends a token for `word` and advances the cursor and position counters.
    const pushToken = (word: string) => {
      const simplifiedEntries = this.dictionary.get(word, false);
      const traditionalEntries = this.dictionary.get(word, true);

      let entries: IEntry[];
      if (simplifiedEntries.length === 0) {
        entries = traditionalEntries;
      } else if (traditionalEntries.length === 0) {
        entries = simplifiedEntries;
      } else if (simplifiedPreference < traditionalPreference) {
        entries = traditionalEntries;
      } else if (simplifiedPreference > traditionalPreference) {
        entries = simplifiedEntries;
      } else {
        // No preference yet: report both lists, but only once per entry. An
        // entry whose two forms are identical would otherwise presumably be
        // returned by both lookups and listed twice.
        entries = Tokenizer.mergeEntries(simplifiedEntries, traditionalEntries);
      }

      // A word known in only one script is evidence for that script.
      if (traditionalEntries.length === 0 && simplifiedEntries.length > 0) {
        simplifiedPreference++;
      } else if (simplifiedEntries.length === 0 && traditionalEntries.length > 0) {
        traditionalPreference++;
      }

      result.push({
        text: word,
        position: { offset, line, column },
        // Each match carries its own headword forms; entries for the same
        // word may differ in their traditional/simplified counterpart.
        matches: entries.map(({ traditional, simplified, pinyin, pinyinPretty, english }) => ({
          traditional,
          simplified,
          pinyin,
          pinyinPretty,
          english,
        })),
      });

      const wordArr = Array.from(word);
      const lastLineBreakIndex = word.lastIndexOf("\n");

      // `i` indexes code points; offset and column count UTF-16 code units.
      i += wordArr.length;
      offset += word.length;
      line += wordArr.filter((x) => x === "\n").length;
      column = lastLineBreakIndex >= 0 ? word.length - lastLineBreakIndex : column + word.length;
    };

    // A character counts as Chinese if it is listed punctuation or has a
    // dictionary entry of its own in either script.
    const isChinese = (character: string) =>
      chinesePunctuation.includes(character) ||
      this.dictionary.get(character, false).length > 0 ||
      this.dictionary.get(character, true).length > 0;

    while (i < text.length) {
      // Try to match two or more characters

      if (i < text.length - 1) {
        const prefix = text.slice(i, i + 2).join("");
        const simplifiedEntries = this.dictionary.getPrefix(prefix, false);
        const traditionalEntries = this.dictionary.getPrefix(prefix, true);
        // Traditional candidates are examined first, so they win length ties.
        const lookups = [
          { entries: traditionalEntries, isTraditional: true },
          { entries: simplifiedEntries, isTraditional: false },
        ];

        let foundWord: string | null = null;
        let foundLength = 0;
        let foundTraditional = false;

        for (const { entries, isTraditional } of lookups) {
          for (const entry of entries) {
            const matchText = isTraditional ? entry.traditional : entry.simplified;
            const matchLength = Array.from(matchText).length;
            const word = text.slice(i, i + matchLength).join("");

            if (matchText === word && (foundWord === null || matchLength > foundLength)) {
              foundWord = word;
              foundLength = matchLength;
              foundTraditional = isTraditional;
            }
          }
        }

        if (foundWord !== null) {
          pushToken(foundWord);

          // The script whose lookup produced the match gains preference.
          if (foundTraditional) {
            traditionalPreference++;
          } else {
            simplifiedPreference++;
          }

          continue;
        }
      }

      // If it fails, match one character

      const character = text[i];

      if (isChinese(character) || Tokenizer.whitespace.test(character)) {
        pushToken(character);
        continue;
      }

      // Handle non-Chinese characters

      let end = i + 1;

      for (; end < text.length; end++) {
        if (Tokenizer.whitespace.test(text[end]) || isChinese(text[end])) break;
      }

      pushToken(text.slice(i, end).join(""));
    }

    return result;
  }
}

/**
 * Tokenizes `s` with the shared `Tokenizer` instance.
 *
 * @param s Text to tokenize.
 * @returns A promise for the tokens produced by `Tokenizer.Tokenize`.
 */
export function Tokenize(s: string) {
  return new Tokenizer().Tokenize(s);
}
