package com.seg.preproc;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

/* loaded from: classes.dex */
/**
 * Splits whitespace-separated text into tokens.
 *
 * <p>Two modes are offered: a "CJK" mode in which every code point above 127
 * becomes its own token, and a "Western" mode in which every character matched
 * by the regex class {@code \p{Punct}} becomes its own token.
 */
public class Tokenizer {
    /** Optional run of '-' followed by digits, a dot, and more digits (e.g. {@code -3.14}). */
    private static final Pattern DECIMAL_NUMBER = Pattern.compile("-*[\\d]+\\.[\\d]+");

    /** A single punctuation character as defined by {@code \p{Punct}}. */
    private static final Pattern PUNCTUATION = Pattern.compile("\\p{Punct}$");

    /** One or more whitespace characters; used to cut the input into words. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Tells whether the whole string is a decimal number with a fractional part.
     *
     * <p>Plain integers such as {@code "42"} do not match because the pattern
     * requires a dot with digits on both sides.
     *
     * @param str the candidate string; must not be {@code null}
     * @return {@code true} if {@code str} fully matches {@code -*\d+\.\d+}
     */
    public boolean isNumber(String str) {
        return DECIMAL_NUMBER.matcher(str).matches();
    }

    /**
     * Breaks one word into tokens and appends them to {@code list}.
     *
     * <p>The word is walked code point by code point (so surrogate pairs stay
     * intact). Code points that are "standalone" are emitted as single-symbol
     * tokens; consecutive non-standalone code points are merged into one token.
     *
     * @param str  the word to split; must not be {@code null}
     * @param list receives the resulting tokens, in order
     * @param z    {@code true} for CJK mode (standalone = code point &gt; 127),
     *             {@code false} for Western mode (standalone = matches {@code \p{Punct}})
     */
    public void splitWord(String str, List<String> list, boolean z) {
        StringBuilder pending = null;
        int index = 0;
        while (index < str.length()) {
            final int codePoint = str.codePointAt(index);
            final String symbol = String.valueOf(Character.toChars(codePoint));
            // The regex is only consulted in Western mode, as in the original short-circuit.
            final boolean standalone = z
                    ? codePoint > 127
                    : PUNCTUATION.matcher(symbol).matches();
            if (standalone) {
                // Flush whatever was accumulated before emitting the lone symbol.
                if (pending != null) {
                    list.add(pending.toString());
                    pending = null;
                }
                list.add(symbol);
            } else {
                if (pending == null) {
                    pending = new StringBuilder();
                }
                pending.append(symbol);
            }
            index += Character.charCount(codePoint);
        }
        if (pending != null) {
            list.add(pending.toString());
        }
    }

    /**
     * Tokenizes text in CJK mode: the input is trimmed, split on whitespace, and
     * each word is passed to {@link #splitWord(String, List, boolean)} with
     * {@code z == true}.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return a new mutable list of tokens (empty for blank input)
     */
    public List<String> tokenizeForCJK(String str) {
        List<String> tokens = new ArrayList<>();
        for (String word : splitOnWhitespace(str)) {
            splitWord(word, tokens, true);
        }
        return tokens;
    }

    /**
     * Tokenizes text in Western mode: the input is trimmed and split on
     * whitespace; words accepted by {@link #isNumber(String)} are kept whole,
     * all others are passed to {@link #splitWord(String, List, boolean)} with
     * {@code z == false}.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return a new mutable list of tokens (empty for blank input)
     */
    public List<String> tokenizeForWestern(String str) {
        List<String> tokens = new ArrayList<>();
        for (String word : splitOnWhitespace(str)) {
            if (isNumber(word)) {
                tokens.add(word);
            } else {
                splitWord(word, tokens, false);
            }
        }
        return tokens;
    }

    /** Trims the input and cuts it at runs of whitespace. */
    private static String[] splitOnWhitespace(String str) {
        return WHITESPACE.split(str.trim());
    }
}
