package hivemall.tools.text;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF that splits a string into tokens on a fixed set of delimiter characters.
 *
 * <p>Tokens are returned in their order of appearance. Delimiter characters are never part of
 * a token, and runs of consecutive delimiters do not produce empty tokens.
 */
@UDFType(deterministic = true, stateful = false)
@Description(name = "tokenize", value = "_FUNC_(string englishText [, boolean toLowerCase]) - Returns tokenized words in array<string>")
public final class TokenizeUDF extends UDF {
    /**
     * Characters that separate tokens: space, common punctuation ({@code . , ? ! : ;}),
     * round/angle/square brackets, backspace, tab, newline, form feed, carriage return,
     * double quote, single quote and backslash.
     */
    private static final String DELIM = " .,?!:;()<>[]\b\t\n\f\r\"'\\";

    /**
     * Tokenizes {@code text} without changing the case of the tokens.
     *
     * @param text the text to split; may be {@code null}
     * @return the tokens in order of appearance, or {@code null} if {@code text} is {@code null}
     */
    public List<Text> evaluate(Text text) {
        return evaluate(text, false);
    }

    /**
     * Tokenizes {@code text}, optionally lower-casing every token.
     *
     * @param text the text to split; may be {@code null}
     * @param toLowerCase whether each token is converted to lower case before being returned
     * @return the tokens in order of appearance (empty if {@code text} contains only
     *     delimiters), or {@code null} if {@code text} is {@code null}
     */
    public List<Text> evaluate(Text text, boolean toLowerCase) {
        if (text == null) {
            // A null input yields a null result rather than an empty array.
            return null;
        }
        final List<Text> tokens = new ArrayList<>();
        final StringTokenizer tokenizer = new StringTokenizer(text.toString(), DELIM);
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            if (toLowerCase) {
                // Locale.ROOT keeps the result independent of the JVM's default locale
                // (e.g. under a Turkish locale "I" would otherwise become a dotless "ı"),
                // which is required for the function to be deterministic across machines.
                token = token.toLowerCase(Locale.ROOT);
            }
            tokens.add(new Text(token));
        }
        return tokens;
    }
}
