package sec.bdc.tm.hte.eu.preprocessing.bnlp.postagging;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.commons.lang3.tuple.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sec.bdc.nlp.ds.Token;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.Tags;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.morph.Interpretation;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.morph.InterpretedWord;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.segment.SpecialPatterns;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.segment.SpecialPatternsMatcher;

/* loaded from: classes49.dex */
/**
 * {@link PosTagger} implementation that seeds each word's predicted POS tag from its first
 * interpretation and then overrides it with a sequence of surface-form rules (a token-to-tag
 * lookup map, {@link SpecialPatternsMatcher} checks and a fixed list of regex rules).
 */
public class PosTaggerImpl implements PosTagger {
    // Logger is deliberately keyed on the PosTagger interface, as in the original code,
    // so the logger name seen by logging configuration does not change.
    private static final Logger LOG = LoggerFactory.getLogger(PosTagger.class);

    /**
     * Ordered (pattern, tag string) rules. They are tried in list order and the first pattern
     * that matches the whole raw text wins. The tag string is handed to
     * {@code new Interpretation(String)}; NOTE(review): the "pos:morpho" look of the strings is
     * presumably parsed there -- confirm against {@link Interpretation}.
     */
    private static final List<Pair<Pattern, String>> PATTERN_TO_TAG_RULES = buildPatternToTagRules();

    /** Exact raw-text to POS-tag overrides; a private copy of the map given to the constructor. */
    private final Map<String, String> tokenPosTagMap;

    /**
     * Creates a tagger with the given raw-text to POS-tag overrides.
     *
     * @param map overrides keyed by raw token text; copied defensively. {@code null} is
     *            tolerated: a warning is logged and an empty map is used instead.
     */
    public PosTaggerImpl(Map<String, String> map) {
        if (map == null) {
            LOG.warn("WARNING: tokenPosTagMap in POS tagger is null");
            this.tokenPosTagMap = new HashMap<>();
        } else {
            this.tokenPosTagMap = new HashMap<>(map);
        }
    }

    /** Builds the immutable, ordered list of regex rules consulted by {@link #fixPredictedPosTag}. */
    private static List<Pair<Pattern, String>> buildPatternToTagRules() {
        List<Pair<Pattern, String>> rules = new ArrayList<>();
        rules.add(Pair.of(SpecialPatterns.DECIMAL_NUMBER_PATTERN, "num"));
        rules.add(Pair.of(SpecialPatterns.EMAIL_NOSPLIT_PATTERN, "spec:email"));
        rules.add(Pair.of(SpecialPatterns.URL_NOSPLIT_PATTERN, "spec:url"));
        rules.add(Pair.of(SpecialPatterns.URL_NOSPLIT_PATTERN_WITH_DOTS, "spec:url"));
        rules.add(Pair.of(SpecialPatterns.HASHTAG_NOSPLIT_PATTERN, "spec:hashtag"));
        rules.add(Pair.of(SpecialPatterns.MENTION_NOSPLIT_PATTERN, "spec:user"));
        rules.add(Pair.of(SpecialPatterns.PRICE_SPLIT_PATTERN, "num:curr"));
        rules.add(Pair.of(SpecialPatterns.GRADES_SPLIT_PATTERN, "num:unit"));
        rules.add(Pair.of(SpecialPatterns.DISPLAY_SIZE_SPLIT_PATTERN, "num:unit"));
        rules.add(Pair.of(SpecialPatterns.VERSION_NUMBER_NOSPLIT_PATTERN, "num"));
        rules.add(Pair.of(SpecialPatterns.HTML_ENCODING_NOSPLIT_PATTERN, "xxx"));
        rules.add(Pair.of(SpecialPatterns.DOT_TERMINATED_ABBREVIATIONS_PATTERN, "abbr"));
        rules.add(Pair.of(SpecialPatterns.NUMBER_WITH_PUNCT_PATTERN, "num"));
        // Catch-all for anything starting with '&' that the rules above did not claim.
        rules.add(Pair.of(Pattern.compile("&.*"), "xxx"));
        return Collections.unmodifiableList(rules);
    }

    /**
     * Overrides the predicted POS tag of a single word based on its raw text. The checks run
     * in the order below and, unless noted, the first one that applies ends the method:
     * <ol>
     *   <li>text ends with "." and starts with an upper-case code point: predicted tag
     *       "subst", first token's morpho tags "prn";</li>
     *   <li>text is a key of the override map: predicted tag taken from the map;</li>
     *   <li>emoticon: predicted tag and first token's POS tag "spec", morpho tags "emoticon";</li>
     *   <li>parsable float: predicted tag "num", tokens updated from it;</li>
     *   <li>only special UTF-8 signs: predicted tag {@code Tags.INTERP_TAG}, tokens updated;
     *       this check does <em>not</em> end the method;</li>
     *   <li>first fully matching regex rule: predicted tag and every token's POS/morpho tags
     *       set from the rule's interpretation;</li>
     *   <li>punctuation: predicted tag {@code Tags.INTERP_TAG} (tokens are not updated here).</li>
     * </ol>
     *
     * @param interpretedWord the word to adjust in place
     */
    private void fixPredictedPosTag(InterpretedWord interpretedWord) {
        String rawText = interpretedWord.getWord().getRawText();

        // endsWith(".") guarantees a non-empty string, so codePointAt(0) is safe here.
        if (rawText.endsWith(".") && Character.isUpperCase(rawText.codePointAt(0))) {
            interpretedWord.setPredictedPosTag("subst");
            interpretedWord.getWord().getToken(0).setMpTags("prn");
            return;
        }

        // containsKey (not a null check on get) so that a key mapped to null still short-circuits.
        if (this.tokenPosTagMap.containsKey(rawText)) {
            interpretedWord.setPredictedPosTag(this.tokenPosTagMap.get(rawText));
            return;
        }

        SpecialPatternsMatcher specialPatternsMatcher = new SpecialPatternsMatcher(rawText);
        if (specialPatternsMatcher.isEmoticon()) {
            interpretedWord.setPredictedPosTag("spec");
            interpretedWord.getWord().getToken(0).setPosTag("spec");
            interpretedWord.getWord().getToken(0).setMpTags("emoticon");
            return;
        }
        if (specialPatternsMatcher.isParsableFloatValue()) {
            interpretedWord.setPredictedPosTag("num");
            interpretedWord.updateTokenByPredictedPosTag();
            return;
        }
        if (specialPatternsMatcher.isWordContainsOnlySpecialUTF8Signs()) {
            interpretedWord.setPredictedPosTag(Tags.INTERP_TAG);
            interpretedWord.updateTokenByPredictedPosTag();
            // NOTE(review): unlike the branches above there is no return here, so a regex rule
            // below may still override this tag. Behaviour kept as-is -- confirm it is intended.
        }

        for (Pair<Pattern, String> rule : PATTERN_TO_TAG_RULES) {
            if (rule.getLeft().matcher(rawText).matches()) {
                Interpretation interpretation = new Interpretation(rule.getRight());
                interpretedWord.setPredictedPosTag(interpretation.getPos());
                for (Token token : interpretedWord.getWord().getTokenList()) {
                    token.setPosTag(interpretation.getPos());
                    token.setMpTags(interpretation.getMorphoTags());
                }
                return;
            }
        }

        if (specialPatternsMatcher.isPunct()) {
            interpretedWord.setPredictedPosTag(Tags.INTERP_TAG);
        }
    }

    /**
     * Tags every word in the list in place.
     *
     * <p>First pass: each word's predicted POS tag becomes the POS of its first interpretation,
     * or "unk" when it has none. Second pass: each word's tokens are updated from that tag and
     * the surface-form rules of {@link #fixPredictedPosTag} are applied.
     *
     * @param list words to tag; must not be {@code null}
     */
    @Override // sec.bdc.tm.hte.eu.preprocessing.bnlp.postagging.PosTagger
    public void doTagging(List<InterpretedWord> list) {
        for (InterpretedWord word : list) {
            List<Interpretation> interpretations = word.getInterpretations();
            word.setPredictedPosTag(!interpretations.isEmpty() ? interpretations.get(0).getPos() : "unk");
        }
        // Kept as a separate pass so all initial predictions are in place before any token is
        // updated, exactly as in the original ordering of side effects.
        for (InterpretedWord word : list) {
            word.updateTokenByPredictedPosTag();
            fixPredictedPosTag(word);
        }
    }
}
