package sec.bdc.tm.hte.eu.ngram.tokenizer;

import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;

/* loaded from: classes49.dex */
/**
 * Regular-expression based tokenizer for informal text.
 *
 * <p>Tokenization happens in three steps:
 * <ol>
 *   <li>whitespace runs are collapsed to single spaces and the text is trimmed;</li>
 *   <li>quotes/brackets glued to the edge of an alphanumeric word are separated from it;</li>
 *   <li>spans matching {@link #PROTECTED_PATTERN} (emoticons, URLs, HTML entities, times,
 *       numbers, punctuation runs, abbreviations, separators, decorations and words with an
 *       embedded apostrophe) are kept as single tokens, while the text between them is split
 *       on spaces.</li>
 * </ol>
 * Finally, tokens that are English-style contractions are split in two (e.g. {@code don't}
 * becomes {@code do} + {@code n't}) and empty tokens are dropped.
 *
 * <p>The class holds no mutable state; all patterns are compiled once.
 */
public class Twokenizer implements Tokenizer {

    // ---- Regex building blocks. These are compile-time constants, so the composed
    // ---- patterns below are exactly the strings that were previously written out in full.

    private static final String ENTITY = "&(amp|lt|gt|quot);";
    private static final String BOUNDARY_NOT_DOT = "($|\\s|[“\\\"?!,:;]|" + ENTITY + ")";

    // Abbreviations: "U.S.", "e.g", and a fixed list of titles followed by a dot.
    private static final String AA1 = "([A-Za-z]\\.){2,}(?=" + BOUNDARY_NOT_DOT + ")";
    private static final String AA2 = "[^A-Za-z]([A-Za-z]\\.)+[A-Za-z](?=" + BOUNDARY_NOT_DOT + ")";
    private static final String STANDARD_ABBREVIATIONS = "\\b([Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\\.";
    private static final String ARBITRARY_ABBREV = "(" + AA1 + "|" + AA2 + "|" + STANDARD_ABBREVIATIONS + ")";

    // Punctuation.
    private static final String PUNCT_CHARS = "['“\\\".?!,:;]";
    private static final String PUNCT_SEQ = PUNCT_CHARS + "+";
    private static final String EDGE_PUNCT_CHARS = "'\\\"“”‘’<>«»{}\\(\\)\\[\\]";
    private static final String EDGE_PUNCT = "[" + EDGE_PUNCT_CHARS + "]";
    private static final String NOT_EDGE_PUNCT = "[a-zA-Z0-9]";
    private static final String SEPARATORS = "(--+|―)";
    private static final String DECORATIONS = "[♫]+";

    // Words containing an apostrophe, e.g. "o'clock".
    private static final String THINGS_THAT_SPLIT_WORDS = "[^\\s\\.,]";
    private static final String EMBEDDED_APOSTROPHE = THINGS_THAT_SPLIT_WORDS + "+'" + THINGS_THAT_SPLIT_WORDS + "+";

    // Emoticons: eyes, optional nose, mouth.
    private static final String NORMAL_EYES = "(?iu)[:=]";
    private static final String WINK = "[;]";
    private static final String NOSE_AREA = "(|o|O|-)";
    private static final String TONGUE = "[pP]";
    private static final String OTHER_MOUTHS = "[doO/\\\\]";
    private static final String SAD_MOUTHS = "[\\(\\[]";
    private static final String HAPPY_MOUTHS = "[D\\)\\]]";
    private static final String EMOTICON = "(" + NORMAL_EYES + "|" + WINK + ")" + NOSE_AREA
            + "(" + TONGUE + "|" + OTHER_MOUTHS + "|" + SAD_MOUTHS + "|" + HAPPY_MOUTHS + ")";

    // Numbers and times.
    private static final String NUMBER_WITH_COMMAS = "(\\d+,)+?\\d{3}(?=([^,]|$))";
    private static final String NUM_NUM = "\\d+\\.\\d+";
    private static final String TIME_LIKE = "\\d+:\\d+";

    // URLs: either an explicit scheme/"www." prefix or a host ending in a common TLD.
    private static final String COMMON_TOP_LEVEL_DOMAINS = "(com|co\\.uk|org|net|info|ca|ly)";
    private static final String URL_START_1 = "(https?://|www\\.)";
    private static final String URL_START_2 = "[A-Za-z0-9\\.-]+?\\." + COMMON_TOP_LEVEL_DOMAINS + "(?=[/ \\W])";
    private static final String URL_BODY = "[^ \\t\\r\\n<>]*?";
    private static final String URL_EXTRA_CRAP_BEFORE_END = "(" + PUNCT_CHARS + "|" + ENTITY + ")+?";
    private static final String URL_END = "(\\.\\.+|[<>]|\\s|$)";
    private static final String URL = "\\b(" + URL_START_1 + "|" + URL_START_2 + ")" + URL_BODY
            + "(?=(" + URL_EXTRA_CRAP_BEFORE_END + ")?" + URL_END + ")";

    // ---- Compiled patterns.

    /** A word followed by a contraction suffix; group 1 is the stem, group 2 the suffix. */
    private static final Pattern CONTRACTIONS = Pattern.compile("(?i)^(\\w+)(n't|'ve|'ll|'d|'re|'s|'m)$");

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Edge punctuation directly before an alphanumeric character at the start of a word. */
    private static final Pattern EDGE_PUNCT_LEFT =
            Pattern.compile("(\\s|^)(" + EDGE_PUNCT + "+)(" + NOT_EDGE_PUNCT + ")");

    /** Edge punctuation directly after an alphanumeric character at the end of a word. */
    private static final Pattern EDGE_PUNCT_RIGHT =
            Pattern.compile("(" + NOT_EDGE_PUNCT + ")(" + EDGE_PUNCT + "+)(\\s|$)");

    /** Spans that must survive as a single token. Alternatives are tried in the order listed. */
    private static final Pattern PROTECTED_PATTERN = Pattern.compile("("
            + EMOTICON + "|"
            + URL + "|"
            + ENTITY + "|"
            + TIME_LIKE + "|"
            + NUM_NUM + "|"
            + NUMBER_WITH_COMMAS + "|"
            + PUNCT_SEQ + "|"
            + ARBITRARY_ABBREV + "|"
            + SEPARATORS + "|"
            + DECORATIONS + "|"
            + EMBEDDED_APOSTROPHE
            + ")");

    /**
     * Splits contractions in every token and removes tokens that end up empty.
     *
     * @param list raw tokens in text order
     * @return the final token list, in the same order
     */
    private List<String> calculateFinalTokens(List<String> list) {
        return list.stream()
                .flatMap(this::splitToken)
                .filter(token -> !token.isEmpty())
                .collect(Collectors.toList());
    }

    /**
     * Interleaves the unprotected segments with the protected tokens that separate them:
     * {@code list[0], list2[0], list[1], list2[1], ...}.
     *
     * @param list  token groups obtained from the text between protected spans
     * @param list2 protected tokens; normally one fewer than {@code list}
     * @return a flat list in original text order
     */
    private static List<String> calculateZipped(List<List<String>> list, List<String> list2) {
        List<String> zipped = new ArrayList<>();
        for (int i = 0; i < list.size(); i++) {
            zipped.addAll(list.get(i));
            // The last unprotected segment usually has no protected token after it.
            if (i < list2.size()) {
                zipped.add(list2.get(i));
            }
        }
        return zipped;
    }

    /**
     * Compiler-generated predicate ("token is not empty").
     * Retained only so that separately generated lambda classes keep linking.
     */
    public static final /* synthetic */ boolean lambda$calculateFinalTokens$0$Twokenizer(String str) {
        return !str.isEmpty();
    }

    /**
     * Tokenizes text whose whitespace has already been normalized to single spaces.
     *
     * @param str whitespace-squeezed input
     * @return tokens in text order
     */
    private List<String> simpleTokenize(String str) {
        String text = splitEdgePunct(str);
        Matcher matcher = PROTECTED_PATTERN.matcher(text);

        List<List<String>> unprotected = new ArrayList<>();
        List<String> protectedTokens = new ArrayList<>();
        int cursor = 0; // end of the previous protected span
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            if (start == end) {
                continue; // ignore empty matches
            }
            unprotected.add(splitOnSpace(text.substring(cursor, start)));
            protectedTokens.add(text.substring(start, end));
            cursor = end;
        }
        // Whatever follows the last protected span (or the whole text if there was none).
        unprotected.add(splitOnSpace(text.substring(cursor)));

        return calculateFinalTokens(calculateZipped(unprotected, protectedTokens));
    }

    /** Trims the segment and splits it on single spaces. */
    private static List<String> splitOnSpace(String segment) {
        return Arrays.asList(segment.trim().split(StringUtils.SPACE));
    }

    /**
     * Inserts a space between a word and the quotes/brackets attached to its left or right
     * edge, so that the punctuation becomes a token of its own.
     */
    private String splitEdgePunct(String str) {
        String leftSplit = EDGE_PUNCT_LEFT.matcher(str).replaceAll("$1$2 $3");
        return EDGE_PUNCT_RIGHT.matcher(leftSplit).replaceAll("$1 $2$3");
    }

    /**
     * Compiler-generated bridge to {@link #splitToken(String)}.
     * Retained only so that separately generated lambda classes keep linking.
     */
    public Stream<String> bridge$lambda$0$Twokenizer(String str) {
        return splitToken(str);
    }

    /**
     * Splits a contraction into stem and suffix; any other token is returned trimmed.
     *
     * @param str a single token
     * @return a stream of one or two tokens
     */
    private Stream<String> splitToken(String str) {
        Matcher matcher = CONTRACTIONS.matcher(str);
        // The pattern is anchored with ^...$, so it can match at most once.
        if (matcher.find()) {
            return Stream.of(matcher.group(1).trim(), matcher.group(2).trim());
        }
        return Stream.of(str.trim());
    }

    /** Collapses every whitespace run into one space and trims both ends. */
    private String squeezeWhitespace(String str) {
        return WHITESPACE.matcher(str).replaceAll(StringUtils.SPACE).trim();
    }

    /**
     * Tokenizes the given text.
     *
     * @param str text to tokenize; {@code null} is treated like empty text
     * @return the tokens in text order; empty if the input is {@code null} or blank
     */
    @Override // sec.bdc.tm.hte.eu.ngram.tokenizer.Tokenizer
    public List<String> tokenize(String str) {
        if (str == null) {
            return new ArrayList<>();
        }
        return simpleTokenize(squeezeWhitespace(str));
    }
}
