/*
 * Decompiled with CFR 0.152.
 */
package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Normalizer2;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.PrimitiveIterator;
import java.util.function.IntPredicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.AttributeSource;
import org.elasticsearch.core.CheckedFunction;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.CharSeqTokenTrieNode;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.ControlCharFilter;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.DelimitedToken;

public final class BasicTokenFilter
extends TokenFilter {
    private final CharTermAttribute termAtt = (CharTermAttribute)this.addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = (OffsetAttribute)this.addAttribute(OffsetAttribute.class);
    private final CharSeqTokenTrieNode neverSplit;
    private final LinkedList<DelimitedToken> tokens;
    private final boolean isStripAccents;
    private final CharArraySet neverSplitSet;
    private final Normalizer2 normalizer;
    private final StringBuilder accentBuffer = new StringBuilder();
    private final IntPredicate splitOn;
    private AttributeSource.State current;

    public static BasicTokenFilter build(final boolean isTokenizeCjkChars, final boolean isStripAccents, List<String> neverSplit, TokenStream input) throws IOException {
        CharSeqTokenTrieNode neverSplitTree;
        Analyzer analyzer = new Analyzer(){

            protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
                WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
                BasicTokenFilter stream = new BasicTokenFilter((TokenStream)tokenizer, CharSeqTokenTrieNode.EMPTY, CharArraySet.EMPTY_SET, isStripAccents, isTokenizeCjkChars);
                return new Analyzer.TokenStreamComponents((Tokenizer)tokenizer, (TokenStream)stream);
            }

            protected Reader initReader(String fieldName, Reader reader) {
                return new ControlCharFilter(reader);
            }
        };
        CharArraySet neverSplitSet = new CharArraySet(neverSplit, false);
        try (Analyzer analyzer2 = analyzer;){
            neverSplitTree = CharSeqTokenTrieNode.build(neverSplit, (CheckedFunction<String, List<String>, IOException>)((CheckedFunction)c -> {
                try (TokenStream ts = analyzer.tokenStream("never_split", c);){
                    CharTermAttribute term = (CharTermAttribute)ts.addAttribute(CharTermAttribute.class);
                    ts.reset();
                    ArrayList<String> tokens = new ArrayList<String>();
                    while (ts.incrementToken()) {
                        tokens.add(term.toString());
                    }
                    ArrayList<String> arrayList = tokens;
                    return arrayList;
                }
            }));
        }
        return new BasicTokenFilter(input, neverSplitTree, neverSplitSet, isStripAccents, isTokenizeCjkChars);
    }

    public BasicTokenFilter(TokenStream input, CharSeqTokenTrieNode neverSplit, CharArraySet neverSplitSet, boolean isStripAccents, boolean isTokenizeCjkChars) {
        super(input);
        this.neverSplit = neverSplit;
        this.neverSplitSet = neverSplitSet;
        this.tokens = new LinkedList();
        this.isStripAccents = isStripAccents;
        this.normalizer = Normalizer2.getNFDInstance();
        this.splitOn = cp -> isTokenizeCjkChars && BasicTokenFilter.isCjkChar(cp) || BasicTokenFilter.isPunctuationMark(cp);
    }

    public void reset() throws IOException {
        super.reset();
        this.tokens.clear();
        this.accentBuffer.setLength(0);
        this.current = null;
    }

    public boolean incrementToken() throws IOException {
        if (!this.tokens.isEmpty()) {
            assert (this.current != null);
            DelimitedToken token = this.tokens.removeFirst();
            this.restoreState(this.current);
            this.termAtt.setEmpty().append(token.charSequence());
            this.offsetAtt.setOffset(token.startOffset(), token.endOffset());
            return true;
        }
        this.current = null;
        while (this.input.incrementToken()) {
            if (this.neverSplitSet.contains((CharSequence)this.termAtt)) {
                return true;
            }
            LinkedList<DelimitedToken> splits = this.split();
            LinkedList<DelimitedToken> delimitedTokens = this.mergeSplits(splits);
            if (this.isStripAccents) {
                for (DelimitedToken token : delimitedTokens) {
                    DelimitedToken stripped = this.stripAccent(token);
                    if (stripped.charSequence().isEmpty()) continue;
                    this.tokens.add(stripped);
                }
            } else {
                this.tokens.addAll(delimitedTokens);
            }
            this.current = this.captureState();
            if (this.tokens.isEmpty()) continue;
            DelimitedToken token = this.tokens.removeFirst();
            this.termAtt.setEmpty().append(token.charSequence());
            this.offsetAtt.setOffset(token.startOffset(), token.endOffset());
            return true;
        }
        return false;
    }

    private DelimitedToken stripAccent(DelimitedToken token) {
        this.accentBuffer.setLength(0);
        boolean changed = false;
        if (this.normalizer.quickCheck(token.charSequence()) != Normalizer.YES) {
            this.normalizer.normalize(token.charSequence(), this.accentBuffer);
            changed = true;
        } else {
            this.accentBuffer.append(token.charSequence());
        }
        ArrayList<Integer> badIndices = new ArrayList<Integer>();
        ArrayList<Integer> charCount = new ArrayList<Integer>();
        int index = 0;
        int deletedIndices = 0;
        PrimitiveIterator.OfInt it = this.accentBuffer.codePoints().iterator();
        while (it.hasNext()) {
            int cp = it.next();
            if (Character.getType(cp) == 6) {
                badIndices.add(index - deletedIndices);
                charCount.add(Character.charCount(cp));
                ++deletedIndices;
                changed = true;
            }
            ++index;
        }
        for (int i = 0; i < badIndices.size(); ++i) {
            int badIndex = (Integer)badIndices.get(i);
            int count = (Integer)charCount.get(i);
            for (int j = 0; j < count && badIndex < this.accentBuffer.length(); ++j) {
                this.accentBuffer.deleteCharAt(badIndex);
            }
        }
        if (changed) {
            return new DelimitedToken(this.accentBuffer.toString(), token.startOffset(), token.endOffset());
        }
        return token;
    }

    private LinkedList<DelimitedToken> split() {
        LinkedList<DelimitedToken> splits = new LinkedList<DelimitedToken>();
        int startOffset = this.offsetAtt.startOffset();
        int charIndex = 0;
        int lastCharSplit = 0;
        PrimitiveIterator.OfInt it = this.termAtt.codePoints().iterator();
        while (it.hasNext()) {
            int cp = it.next();
            if (this.splitOn.test(cp)) {
                int charCount = charIndex - lastCharSplit;
                if (charCount > 0) {
                    splits.add(new DelimitedToken(this.termAtt.subSequence(lastCharSplit, charIndex), lastCharSplit + startOffset, charIndex + startOffset));
                }
                splits.add(new DelimitedToken(this.termAtt.subSequence(charIndex, charIndex + 1), charIndex + startOffset, charIndex + 1 + startOffset));
                lastCharSplit = charIndex + 1;
            }
            charIndex += Character.charCount(cp);
        }
        if (lastCharSplit < this.termAtt.length()) {
            splits.add(new DelimitedToken(this.termAtt.subSequence(lastCharSplit, this.termAtt.length()), lastCharSplit + startOffset, this.offsetAtt.endOffset()));
        }
        return splits;
    }

    private LinkedList<DelimitedToken> mergeSplits(LinkedList<DelimitedToken> splits) {
        if (splits.size() == 1) {
            return splits;
        }
        LinkedList<DelimitedToken> mergedTokens = new LinkedList<DelimitedToken>();
        ArrayList<DelimitedToken> matchingTokens = new ArrayList<DelimitedToken>();
        CharSeqTokenTrieNode current = this.neverSplit;
        for (DelimitedToken token : splits) {
            CharSeqTokenTrieNode childNode = current.getChild(token.charSequence());
            if (childNode == null) {
                if (current != this.neverSplit) {
                    mergedTokens.addAll(matchingTokens);
                    matchingTokens = new ArrayList();
                    current = this.neverSplit;
                }
                if ((childNode = current.getChild(token.charSequence())) == null) {
                    mergedTokens.add(token);
                    continue;
                }
                matchingTokens.add(token);
                current = childNode;
                continue;
            }
            if (childNode.isLeaf()) {
                matchingTokens.add(token);
                DelimitedToken mergedToken = DelimitedToken.mergeTokens(matchingTokens);
                if (this.neverSplitSet.contains(mergedToken.charSequence())) {
                    mergedTokens.add(mergedToken);
                } else {
                    mergedTokens.addAll(matchingTokens);
                }
                matchingTokens = new ArrayList();
                current = this.neverSplit;
                continue;
            }
            matchingTokens.add(token);
            current = childNode;
        }
        if (!matchingTokens.isEmpty()) {
            mergedTokens.addAll(matchingTokens);
        }
        return mergedTokens;
    }

    static boolean isPunctuationMark(int codePoint) {
        if (codePoint >= 33 && codePoint <= 47 || codePoint >= 58 && codePoint <= 64 || codePoint >= 91 && codePoint <= 96 || codePoint >= 123 && codePoint <= 126) {
            return true;
        }
        int category = Character.getType(codePoint);
        return category >= 20 && category <= 24 || category >= 29 && category <= 30;
    }

    private static boolean isCjkChar(int codePoint) {
        Character.UnicodeBlock block = Character.UnicodeBlock.of(codePoint);
        return Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS.equals(block) || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS.equals(block) || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A.equals(block) || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B.equals(block) || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C.equals(block) || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D.equals(block) || Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E.equals(block) || Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT.equals(block);
    }
}

