package opennlp.tools.tokenize;

import java.io.IOException;
import java.io.ObjectStreamException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import opennlp.model.MaxentModel;
import opennlp.model.TrainUtil;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.model.ModelUtil;

/* loaded from: input_file:resources/install/10/tika-bundle-1.10.jar:opennlp-tools-1.5.3.jar:opennlp/tools/tokenize/TokenizerME.class */
/**
 * Tokenizer that refines whitespace-separated chunks of text with a maxent model.
 *
 * <p>The input is first cut at whitespace by {@link WhitespaceTokenizer}. Every resulting chunk
 * is then either kept as one token or cut further at the character positions for which the
 * model's best outcome is {@link #SPLIT}.
 *
 * <p>An instance reuses its internal token and probability buffers on every call to
 * {@link #tokenizePos(String)}; a single instance must therefore not be used by several threads
 * at the same time, and {@link #getTokenProbabilities()} always refers to the most recent call.
 */
public class TokenizerME extends AbstractTokenizer {
    /** Model outcome label that makes {@link #tokenizePos(String)} cut a chunk at a position. */
    public static final String SPLIT = "T";

    /**
     * Counterpart outcome label to {@link #SPLIT}. It is not referenced inside this class;
     * presumably it labels positions at which no split occurs (confirm against the event stream).
     */
    public static final String NO_SPLIT = "F";

    /**
     * Alphanumeric pattern compiled from {@link Factory#DEFAULT_ALPHANUMERIC}.
     *
     * @deprecated not read by this class; each tokenizer instance uses the pattern supplied by
     *     the factory it was constructed with.
     */
    @Deprecated
    public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);

    /** Pattern used by the alphanumeric shortcut in {@link #tokenizePos(String)}. */
    private final Pattern alphanumeric;

    /** Model that scores each candidate split position. */
    private final MaxentModel model;

    /** Produces the model context for a position inside a chunk. */
    private final TokenContextGenerator cg;

    /** Whether chunks fully matching {@link #alphanumeric} bypass the model. */
    private final boolean useAlphaNumericOptimization;

    /** Probability of each token produced by the last {@link #tokenizePos(String)} call. */
    private final List<Double> tokProbs;

    /** Token spans produced by the last {@link #tokenizePos(String)} call. */
    private final List<Span> newTokens;

    /**
     * Creates a tokenizer whose pattern, context generator and optimization flag all come from
     * the {@link TokenizerFactory} stored in the model.
     *
     * @param tokenizerModel the trained model to tokenize with
     */
    public TokenizerME(TokenizerModel tokenizerModel) {
        TokenizerFactory modelFactory = tokenizerModel.getFactory();
        this.alphanumeric = modelFactory.getAlphaNumericPattern();
        this.cg = modelFactory.getContextGenerator();
        this.model = tokenizerModel.getMaxentModel();
        this.useAlphaNumericOptimization = modelFactory.isUseAlphaNumericOptmization();
        this.newTokens = new ArrayList<Span>();
        this.tokProbs = new ArrayList<Double>(50);
    }

    /**
     * Creates a tokenizer whose pattern and context generator are obtained from the given
     * language {@link Factory}, using the language and abbreviation dictionary of the model.
     *
     * @param tokenizerModel the trained model to tokenize with
     * @param factory supplies the language specific pattern and context generator
     */
    public TokenizerME(TokenizerModel tokenizerModel, Factory factory) {
        String language = tokenizerModel.getLanguage();
        Set<String> abbreviations = getAbbreviations(tokenizerModel.getAbbreviations());
        this.alphanumeric = factory.getAlphanumeric(language);
        this.cg = factory.createTokenContextGenerator(language, abbreviations);
        this.model = tokenizerModel.getMaxentModel();
        this.useAlphaNumericOptimization = tokenizerModel.useAlphaNumericOptimization();
        this.newTokens = new ArrayList<Span>();
        this.tokProbs = new ArrayList<Double>(50);
    }

    /**
     * Converts an optional abbreviation dictionary into a string set.
     *
     * @param dictionary the abbreviation dictionary, may be {@code null}
     * @return the dictionary entries as strings, or an empty set when no dictionary is given
     */
    private static Set<String> getAbbreviations(Dictionary dictionary) {
        if (dictionary == null) {
            // Explicit type witness: a bare emptySet() is only inferred correctly on Java 8+.
            return Collections.<String>emptySet();
        }
        return dictionary.asStringSet();
    }

    /**
     * Returns the probabilities recorded by the most recent {@link #tokenizePos(String)} call.
     *
     * @return a new array with one entry per token, in token order; empty before the first call
     */
    public double[] getTokenProbabilities() {
        double[] probabilities = new double[this.tokProbs.size()];
        for (int i = 0; i < probabilities.length; i++) {
            probabilities[i] = this.tokProbs.get(i).doubleValue();
        }
        return probabilities;
    }

    /**
     * Tokenizes the given text and returns the character spans of the tokens.
     *
     * <p>Chunks shorter than two characters, and (when the alphanumeric optimization is enabled)
     * chunks fully matching the alphanumeric pattern, are emitted unchanged with probability
     * {@code 1.0}. All other chunks are handed to the model position by position.
     *
     * @param str the text to tokenize
     * @return the token spans, expressed as offsets into {@code str}
     */
    @Override // opennlp.tools.tokenize.Tokenizer
    public Span[] tokenizePos(String str) {
        Span[] whitespaceSpans = WhitespaceTokenizer.INSTANCE.tokenizePos(str);
        this.newTokens.clear();
        this.tokProbs.clear();
        for (Span chunk : whitespaceSpans) {
            String chunkText = str.substring(chunk.getStart(), chunk.getEnd());
            // A single character has no inner position to split at; the pattern check is only
            // evaluated for longer chunks and only when the optimization is switched on.
            boolean keepWhole = chunkText.length() < 2
                    || (useAlphaNumericOptimization() && this.alphanumeric.matcher(chunkText).matches());
            if (keepWhole) {
                this.newTokens.add(chunk);
                this.tokProbs.add(Double.valueOf(1.0d));
            } else {
                splitChunk(chunk, chunkText);
            }
        }
        return this.newTokens.toArray(new Span[this.newTokens.size()]);
    }

    /**
     * Asks the model about every inner position of a chunk and appends the resulting tokens and
     * their probabilities to the internal buffers.
     *
     * @param chunk the span of the chunk inside the full text
     * @param chunkText the text covered by {@code chunk}
     */
    private void splitChunk(Span chunk, String chunkText) {
        final int chunkStart = chunk.getStart();
        final int chunkEnd = chunk.getEnd();
        int tokenStart = chunkStart;
        // Product of the best-outcome probabilities of all decisions since the last split.
        double tokenProbability = 1.0d;
        for (int position = chunkStart + 1; position < chunkEnd; position++) {
            // The context generator works with offsets relative to the chunk, not the full text.
            double[] outcomeProbabilities =
                    this.model.eval(this.cg.getContext(chunkText, position - chunkStart));
            String bestOutcome = this.model.getBestOutcome(outcomeProbabilities);
            tokenProbability *= outcomeProbabilities[this.model.getIndex(bestOutcome)];
            if (bestOutcome.equals(SPLIT)) {
                this.newTokens.add(new Span(tokenStart, position));
                this.tokProbs.add(Double.valueOf(tokenProbability));
                tokenStart = position;
                tokenProbability = 1.0d;
            }
        }
        // Whatever follows the last split (or the whole chunk if none occurred) is the final token.
        this.newTokens.add(new Span(tokenStart, chunkEnd));
        this.tokProbs.add(Double.valueOf(tokenProbability));
    }

    /**
     * Trains a tokenizer model, taking pattern, context generator and optimization flag from the
     * given tokenizer factory.
     *
     * @param objectStream the training samples
     * @param tokenizerFactory supplies the event stream configuration and is stored in the model
     * @param trainingParameters the settings handed to the trainer
     * @return the trained model
     * @throws IOException declared for failures while reading samples or training
     */
    public static TokenizerModel train(ObjectStream<TokenSample> objectStream, TokenizerFactory tokenizerFactory, TrainingParameters trainingParameters) throws IOException {
        // Filled by the trainer and then stored in the model alongside the maxent model.
        HashMap<String, String> manifestEntries = new HashMap<String, String>();
        TokSpanEventStream events = new TokSpanEventStream(
                objectStream,
                tokenizerFactory.isUseAlphaNumericOptmization(),
                tokenizerFactory.getAlphaNumericPattern(),
                tokenizerFactory.getContextGenerator());
        return new TokenizerModel(
                TrainUtil.train(events, trainingParameters.getSettings(), manifestEntries),
                manifestEntries,
                tokenizerFactory);
    }

    /**
     * Trains a tokenizer model without an abbreviation dictionary.
     *
     * @param str the language code of the training data
     * @param objectStream the training samples
     * @param z whether the alphanumeric optimization is enabled
     * @param trainingParameters the settings handed to the trainer
     * @return the trained model
     * @throws IOException declared for failures while reading samples or training
     */
    public static TokenizerModel train(String str, ObjectStream<TokenSample> objectStream, boolean z, TrainingParameters trainingParameters) throws IOException {
        return train(str, objectStream, (Dictionary) null, z, trainingParameters);
    }

    /**
     * Trains a tokenizer model using the language specific defaults of a new {@link Factory}.
     *
     * @param str the language code of the training data
     * @param objectStream the training samples
     * @param dictionary the abbreviation dictionary, may be {@code null}
     * @param z whether the alphanumeric optimization is enabled
     * @param trainingParameters the settings handed to the trainer
     * @return the trained model
     * @throws IOException declared for failures while reading samples or training
     */
    public static TokenizerModel train(String str, ObjectStream<TokenSample> objectStream, Dictionary dictionary, boolean z, TrainingParameters trainingParameters) throws IOException {
        Factory languageFactory = new Factory();
        // Filled by the trainer and then stored in the model alongside the maxent model.
        HashMap<String, String> manifestEntries = new HashMap<String, String>();
        TokSpanEventStream events = new TokSpanEventStream(
                objectStream,
                z,
                languageFactory.getAlphanumeric(str),
                languageFactory.createTokenContextGenerator(str, getAbbreviations(dictionary)));
        return new TokenizerModel(
                str,
                TrainUtil.train(events, trainingParameters.getSettings(), manifestEntries),
                dictionary,
                z,
                manifestEntries);
    }

    /**
     * Trains a tokenizer model from two numeric settings instead of a parameter object.
     *
     * @param str the language code of the training data
     * @param objectStream the training samples
     * @param z whether the alphanumeric optimization is enabled
     * @param i second argument passed to {@code ModelUtil.createTrainingParameters}
     *     (presumably the cutoff; confirm against that method)
     * @param i2 first argument passed to {@code ModelUtil.createTrainingParameters}
     *     (presumably the number of iterations; confirm against that method)
     * @return the trained model
     * @throws IOException declared for failures while reading samples or training
     * @deprecated use
     *     {@link #train(String, ObjectStream, boolean, TrainingParameters)} instead
     */
    @Deprecated
    public static TokenizerModel train(String str, ObjectStream<TokenSample> objectStream, boolean z, int i, int i2) throws IOException {
        return train(str, objectStream, z, ModelUtil.createTrainingParameters(i2, i));
    }

    /**
     * Trains a tokenizer model with the fixed numeric settings {@code 5} and {@code 100}.
     *
     * @param str the language code of the training data
     * @param objectStream the training samples
     * @param z whether the alphanumeric optimization is enabled
     * @return the trained model
     * @throws IOException declared for failures while reading samples or training
     * @throws ObjectStreamException declared for compatibility; not thrown by this method itself
     */
    public static TokenizerModel train(String str, ObjectStream<TokenSample> objectStream, boolean z) throws IOException, ObjectStreamException {
        return train(str, objectStream, z, 5, 100);
    }

    /**
     * Tells whether chunks that fully match the alphanumeric pattern bypass the model.
     *
     * @return {@code true} if the alphanumeric optimization is enabled
     */
    public boolean useAlphaNumericOptimization() {
        return this.useAlphaNumericOptimization;
    }
}
