package sec.bdc.tm.hte.eu.ngram.extraction;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sec.bdc.nlp.Language;
import sec.bdc.tm.hte.eu.preprocessing.resource.ResourceObject;

/* loaded from: classes49.dex */
public class NgramValidator {
    private static final Logger LOG = LoggerFactory.getLogger((Class<?>) NgramValidator.class);
    private static final Set<Language> SUPPORTED_LANGUAGES = ImmutableSet.of(Language.en, Language.es);
    private static final Map<String, String> PAIRED_CHARS = Maps.newHashMap();
    private static final Set<String> NON_STARTING_CHARS = Sets.newHashSet();
    private final Set<String> stopPrefixes = Sets.newHashSet();
    private final Set<String> stopSuffixes = Sets.newHashSet();
    private final Set<String> stopUnigrams = Sets.newHashSet();
    private final Set<String> stopNgrams = Sets.newHashSet();

    static {
        PAIRED_CHARS.put("\"", "\"");
        PAIRED_CHARS.put("(", ")");
        PAIRED_CHARS.put("'", "'");
        NON_STARTING_CHARS.add(")");
    }

    public NgramValidator(Language language) {
        if (!SUPPORTED_LANGUAGES.contains(language)) {
            LOG.warn("Language {} not fully supported by ngram extractor!", language);
            return;
        }
        String langPrefix = getLangPrefix(language);
        loadStringsFromFile(langPrefix + "stop_prefix.txt", this.stopPrefixes);
        loadStringsFromFile(langPrefix + "stop_suffix.txt", this.stopSuffixes);
        loadStringsFromFile(langPrefix + "stop_unigram.txt", this.stopUnigrams);
        loadStringsFromFile(langPrefix + "stop_ngram.txt", this.stopNgrams);
    }

    public NgramValidator(ResourceObject resourceObject) {
        Language language = resourceObject.getLanguage();
        if (!SUPPORTED_LANGUAGES.contains(language)) {
            LOG.warn("Language {} not fully supported by ngram extractor!", language);
            return;
        }
        String langPrefix = getLangPrefix(language);
        loadStringsFromResources(resourceObject, langPrefix + "stop_prefix.txt", this.stopPrefixes);
        loadStringsFromResources(resourceObject, langPrefix + "stop_suffix.txt", this.stopSuffixes);
        loadStringsFromResources(resourceObject, langPrefix + "stop_unigram.txt", this.stopUnigrams);
        loadStringsFromResources(resourceObject, langPrefix + "stop_ngram.txt", this.stopNgrams);
    }

    private static String getLangPrefix(Language language) {
        return language.toString().toLowerCase() + "/";
    }

    private static boolean hasInvalidCharacters(String str) {
        return !str.matches("[\\p{L}\\p{Nd}\"'#@\\-&_]+");
    }

    private boolean hasUnpairedTokens(List<String> list) {
        return hasUnpairedTokens(String.join(StringUtils.SPACE, list));
    }

    private boolean isInvalidUnigram(String str) {
        return isSingleCharacter(str) || isStopUnigram(str) || isNonYearNumber(str) || hasInvalidCharacters(str) || isWithoutLetterNumberOrQuote(str);
    }

    private static boolean isNonYearNumber(String str) {
        return str.matches("[0-9]+") && !str.matches("(19|20)[0-9][0-9]");
    }

    private static boolean isSingleCharacter(String str) {
        return str.length() == 1;
    }

    private boolean isStopUnigram(String str) {
        return this.stopUnigrams.contains(str);
    }

    private boolean isValidFirstToken(String str) {
        return (isWithoutLetterNumberOrQuote(str) || this.stopPrefixes.contains(str)) ? false : true;
    }

    private boolean isValidLastToken(String str) {
        return (isWithoutLetterNumberOrQuote(str) || this.stopSuffixes.contains(str)) ? false : true;
    }

    private boolean isValidNonUnigram(List<String> list) {
        if (this.stopNgrams.contains(String.join(StringUtils.SPACE, list)) || hasUnpairedTokens(list) || !isValidFirstToken(list.get(0))) {
            return false;
        }
        return isValidLastToken(list.get(list.size() - 1));
    }

    private static boolean isWithoutLetterNumberOrQuote(String str) {
        return !str.matches(".*[\\p{L}\\p{Nd}\"'].*");
    }

    private void loadStringsFromFile(String str, Set<String> set) {
        try {
            InputStream resourceAsStream = NgramValidator.class.getResourceAsStream(str);
            Throwable th = null;
            try {
                processInput(str, set, resourceAsStream);
                if (resourceAsStream != null) {
                    if (0 == 0) {
                        resourceAsStream.close();
                        return;
                    }
                    try {
                        resourceAsStream.close();
                    } catch (Throwable th2) {
                        th.addSuppressed(th2);
                    }
                }
            } finally {
            }
        } catch (IOException e) {
            LOG.error("Error loading data from file: {}", (Throwable) e);
        }
    }

    private void loadStringsFromResources(ResourceObject resourceObject, String str, Set<String> set) {
        try {
            ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(resourceObject.get(str));
            Throwable th = null;
            try {
                processInput(str, set, byteArrayInputStream);
                if (byteArrayInputStream != null) {
                    if (0 == 0) {
                        byteArrayInputStream.close();
                        return;
                    }
                    try {
                        byteArrayInputStream.close();
                    } catch (Throwable th2) {
                        th.addSuppressed(th2);
                    }
                }
            } finally {
            }
        } catch (IOException e) {
            LOG.error("Error loading data from file: {}", (Throwable) e);
        }
    }

    /* JADX WARN: Removed duplicated region for block: B:37:0x0031  */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    private void processInput(java.lang.String r7, java.util.Set<java.lang.String> r8, java.io.InputStream r9) throws java.io.IOException {
        /*
            r6 = this;
            if (r9 != 0) goto La
            org.slf4j.Logger r3 = sec.bdc.tm.hte.eu.ngram.extraction.NgramValidator.LOG
            java.lang.String r4 = "Error loading data from resource path: {}"
            r3.error(r4, r7)
        L9:
            return
        La:
            java.io.BufferedReader r1 = new java.io.BufferedReader
            java.io.InputStreamReader r3 = new java.io.InputStreamReader
            java.nio.charset.Charset r4 = sec.bdc.tm.hte.eu.Constants.CHARSET
            r3.<init>(r9, r4)
            r1.<init>(r3)
            r5 = 0
        L17:
            java.lang.String r0 = r1.readLine()     // Catch: java.lang.Throwable -> L2b java.lang.Throwable -> L51
            if (r0 == 0) goto L37
            java.lang.String r2 = r0.trim()     // Catch: java.lang.Throwable -> L2b java.lang.Throwable -> L51
            boolean r3 = r2.isEmpty()     // Catch: java.lang.Throwable -> L2b java.lang.Throwable -> L51
            if (r3 != 0) goto L17
            r8.add(r2)     // Catch: java.lang.Throwable -> L2b java.lang.Throwable -> L51
            goto L17
        L2b:
            r3 = move-exception
            throw r3     // Catch: java.lang.Throwable -> L2d
        L2d:
            r4 = move-exception
            r5 = r3
        L2f:
            if (r1 == 0) goto L36
            if (r5 == 0) goto L4d
            r1.close()     // Catch: java.lang.Throwable -> L48
        L36:
            throw r4
        L37:
            if (r1 == 0) goto L9
            if (r5 == 0) goto L44
            r1.close()     // Catch: java.lang.Throwable -> L3f
            goto L9
        L3f:
            r3 = move-exception
            r5.addSuppressed(r3)
            goto L9
        L44:
            r1.close()
            goto L9
        L48:
            r3 = move-exception
            r5.addSuppressed(r3)
            goto L36
        L4d:
            r1.close()
            goto L36
        L51:
            r3 = move-exception
            r4 = r3
            goto L2f
        */
        throw new UnsupportedOperationException("Method not decompiled: sec.bdc.tm.hte.eu.ngram.extraction.NgramValidator.processInput(java.lang.String, java.util.Set, java.io.InputStream):void");
    }

    public boolean hasUnpairedTokens(String str) {
        HashMultiset create = HashMultiset.create();
        for (String str2 : str.split("")) {
            if (create.contains(str2)) {
                create.remove(str2);
            } else if (PAIRED_CHARS.containsKey(str2)) {
                create.add(PAIRED_CHARS.get(str2));
            } else if (NON_STARTING_CHARS.contains(str2)) {
                return true;
            }
        }
        return !create.isEmpty();
    }

    public boolean isValidNgram(List<String> list) {
        return list.size() == 1 ? !isInvalidUnigram(list.get(0)) : isValidNonUnigram(list);
    }
}
