package sec.bdc.tm.hte.eu.preprocessing.bnlp.sentsplit;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import sec.bdc.nlp.ds.Sentence;
import sec.bdc.nlp.ds.Token;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.normalize.NormalizedText;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.normalize.TextNormalizerFactory;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.segment.SpecialPatterns;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.segment.SpecialPatternsMatcher;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: classes49.dex */
/**
 * {@link SentenceSplitter} implementation that walks the normalized token list of a text and
 * groups the tokens into sentences.
 *
 * <p>A sentence is closed when a token ends with one of {@code . ! ?} or with the pattern
 * {@code SpecialPatterns.ELIPSIS_AT_END}, unless one of the configured non-breaking
 * {@link SrxRule}s matches at that position.
 */
public class SentenceSplitterImpl implements SentenceSplitter {
    /** Characters that close a sentence when they are the last character of a token. */
    private static final String ENDING_SYMBOLS = ".!?";

    /** Characters that are detached from the front of a longer token. */
    private static final String OPENING_SYMBOLS = "'\"([{<";

    /** Characters that are detached from the end of a longer token; never {@code null}. */
    private final String punctuation;

    /** Segmentation rules; only the non-breaking ones are consulted. May be {@code null}. */
    private final List<SrxRule> rules;

    /**
     * Stateful cursor over the tokens of one {@link NormalizedText}. Each call to
     * {@link #next()} consumes tokens from the internal {@code TokenStack} and returns the
     * tokens of one sentence.
     */
    public class StandardSentenceSplitter {
        private final NormalizedText normalizedText;
        private final TokenStack tokenStack;

        private StandardSentenceSplitter(NormalizedText normalizedText) {
            this.normalizedText = normalizedText;
            this.tokenStack = new TokenStack(normalizedText.getNormalizedTokenList());
        }

        /** Returns {@code true} when {@code c} is one of {@code . ! ?}. */
        private boolean endsWithEndingSymbol(char c) {
            return ENDING_SYMBOLS.indexOf(c) != -1;
        }

        /**
         * Returns {@code true} when the token is longer than one character and its last
         * character is one of the configured punctuation characters.
         */
        private boolean endsWithInterpunctionSigns(Token token) {
            // Length is checked first so that charAt is never evaluated on an empty token.
            if (token.length() <= 1) {
                return false;
            }
            char last = token.getRawText().charAt(token.length() - 1);
            return SentenceSplitterImpl.this.punctuation.indexOf(last) != -1;
        }

        /** Returns {@code true} while unconsumed tokens remain on the stack. */
        public boolean hasNext() {
            return this.tokenStack.hasNext();
        }

        /**
         * Returns the index of the first {@code ''} in {@code str}; when there is none, the
         * index of the first {@code ``}; {@code -1} when neither occurs.
         */
        private int indexOfDoubleQuote(String str) {
            int closing = str.indexOf("''");
            if (closing > -1) {
                return closing;
            }
            return str.indexOf("``");
        }

        /**
         * Returns the start of the first match of
         * {@code SentenceSplitterPatterns.POSSIBLE_SENTENCE_ENDING_PATTERN} in {@code str},
         * or {@code -1} when the pattern does not occur.
         */
        private int indexOfEndingSymbol(String str) {
            Matcher matcher = SentenceSplitterPatterns.POSSIBLE_SENTENCE_ENDING_PATTERN.matcher(str);
            return matcher.find() ? matcher.start() : -1;
        }

        /**
         * Returns {@code true} when {@code str} is a single-letter abbreviation, matches
         * {@code SentenceSplitterPatterns.Xx_DOT_ABBR_PATTERN}, or its upper-cased form is
         * contained in {@code SentenceSplitterPatterns.KNOWN_ABBRS}.
         */
        private boolean isAbbreviation(String str) {
            if (isSingleLetterAbbr(str)) {
                return true;
            }
            if (SentenceSplitterPatterns.Xx_DOT_ABBR_PATTERN.matcher(str).matches()) {
                return true;
            }
            // Locale.ROOT keeps the lookup independent of the device locale; with the default
            // locale, e.g. Turkish maps 'i' to a dotted capital and the lookup would miss.
            return SentenceSplitterPatterns.KNOWN_ABBRS.contains(str.toUpperCase(Locale.ROOT));
        }

        /** Returns {@code true} when {@code str} fully matches {@code SentenceSplitterPatterns.ALPHA}. */
        private boolean isAlpha(String str) {
            return SentenceSplitterPatterns.ALPHA.matcher(str).matches();
        }

        /** Returns {@code true} when {@code str} is exactly {@code ''} or {@code ``}. */
        private boolean isDoubleQuote(String str) {
            return "''".equals(str) || "``".equals(str);
        }

        /**
         * Checks a three-part split (left, symbol, right): the left part must be alphabetic
         * and longer than one character, the right part alphabetic and starting with an
         * upper-case letter. Lists with fewer than three parts yield {@code false}.
         */
        private boolean isEndingSymbolsBetweenAlphas(List<Token> list) {
            if (list.size() < 3) {
                // The split did not produce left/symbol/right; nothing to compare.
                return false;
            }
            Token left = list.get(0);
            Token right = list.get(2);
            return isAlpha(left.getRawText())
                    && left.length() > 1
                    && isAlpha(right.getRawText())
                    && startsWithUpperCase(right.getRawText());
        }

        /**
         * Returns {@code true} when {@code str} is kept as a single token without further
         * splitting: alphabetic text, a double quote, a two-character apostrophe token, an
         * abbreviation, an emoticon, a display size, an HTML encoding or a year abbreviation.
         */
        private boolean isNormalToken(String str) {
            SpecialPatternsMatcher special = new SpecialPatternsMatcher(str);
            return isAlpha(str)
                    || isDoubleQuote(str)
                    || isPartPoss(str)
                    || isAbbreviation(str)
                    || special.isEmoticon()
                    || special.isDisplaySize()
                    || special.isHTMLEncoding()
                    || isYearAbbreviation(str);
        }

        /** Returns {@code true} for a two-character string whose first character is an apostrophe. */
        private boolean isPartPoss(String str) {
            return str.length() == 2 && str.charAt(0) == '\'';
        }

        /**
         * Returns {@code true} when {@code str} is longer than one character, starts with one of
         * {@code ' " ( [ { <}, and is neither a double quote, a year abbreviation nor a
         * two-character apostrophe token.
         */
        private boolean isQuotationOrComaBegining(String str) {
            if (str.length() <= 1 || OPENING_SYMBOLS.indexOf(str.charAt(0)) == -1) {
                return false;
            }
            return !isDoubleQuote(str) && !isYearAbbreviation(str) && !isPartPoss(str);
        }

        /**
         * Returns {@code true} when the sentence may end here: either no token follows or the
         * following token starts with an upper-case letter, and no non-breaking rule applies.
         */
        private boolean isSentenceSplitPoint(List<Token> list, Token token) {
            boolean nextLooksLikeStart = !this.tokenStack.hasNext()
                    || startsWithUpperCase(this.tokenStack.peek().getRawText());
            return nextLooksLikeStart && !isSrxBreakingException(list, token);
        }

        /** Returns {@code true} for an upper-case letter followed by a dot, e.g. {@code "A."}. */
        private boolean isSingleLetterAbbr(String str) {
            return str.length() == 2 && Character.isUpperCase(str.charAt(0)) && str.charAt(1) == '.';
        }

        /**
         * Returns {@code true} when a non-breaking rule matches at the end of {@code token}.
         *
         * <p>The rules are evaluated against the normalized text starting at the position of
         * the first token in {@code list}. When more than two tokens remain on the stack the
         * text is cut at the end position of {@code tokenStack.peekThrice()}; otherwise it
         * runs to the end of the normalized text.
         *
         * @param list  tokens collected for the current sentence; must not be empty
         * @param token the token whose end is the candidate break position
         */
        private boolean isSrxBreakingException(List<Token> list, Token token) {
            List<SrxRule> srxRules = SentenceSplitterImpl.this.rules;
            if (srxRules == null || srxRules.isEmpty()) {
                return false;
            }
            int sentenceStart = list.get(0).getPosition();
            String context;
            if (this.tokenStack.size() > 2) {
                context = this.normalizedText.subNormalizedText(sentenceStart, this.tokenStack.peekThrice().endPosition());
            } else {
                context = this.normalizedText.subNormalizedText(sentenceStart);
            }
            // Offset of the token's last character relative to the start of the context.
            int breakOffset = (token.endPosition() - 1) - sentenceStart;
            for (SrxRule rule : srxRules) {
                if (!rule.isBreaking() && rule.matches(context, breakOffset)) {
                    return true;
                }
            }
            return false;
        }

        /** Returns {@code true} when {@code str} fully matches {@code SentenceSplitterPatterns.YEAR}. */
        private boolean isYearAbbreviation(String str) {
            return SentenceSplitterPatterns.YEAR.matcher(str).matches();
        }

        /**
         * Consumes tokens until a sentence boundary is reached or the stack is exhausted.
         *
         * <p>Empty tokens are skipped. Tokens that {@link #isNormalToken} accepts are added
         * unchanged. Other tokens are first offered to {@link #tokenSplitCheck}; when that
         * yields pieces they are pushed back on the stack and processed in later iterations.
         * Otherwise a trailing ellipsis or ending symbol is split off and the sentence is
         * closed if the respective break condition holds.
         *
         * @return the tokens of the next sentence; may be empty
         */
        public List<Token> next() {
            List<Token> sentenceTokens = new ArrayList<Token>();
            while (this.tokenStack.hasNext()) {
                Token current = this.tokenStack.pop();
                String rawText = current.getRawText();
                if (rawText.isEmpty()) {
                    continue;
                }
                // Decide "normal" before calling tokenSplitCheck: that method may append a
                // detached leading symbol to sentenceTokens, which must not happen for a token
                // that is then added as a whole (it would be represented twice).
                if (isNormalToken(rawText)) {
                    sentenceTokens.add(current);
                    continue;
                }
                List<Token> pieces = tokenSplitCheck(current, sentenceTokens);
                if (!pieces.isEmpty()) {
                    this.tokenStack.push(pieces);
                    continue;
                }
                Matcher ellipsis = SpecialPatterns.ELIPSIS_AT_END.matcher(rawText);
                if (ellipsis.find()) {
                    // Detach the trailing ellipsis (capturing group 1) from the token body.
                    sentenceTokens.addAll(split(current, current.length() - ellipsis.group(1).length()));
                    if (isSentenceSplitPoint(sentenceTokens, current)) {
                        break;
                    }
                } else if (endsWithEndingSymbol(rawText.charAt(current.length() - 1))) {
                    // Detach the final '.', '!' or '?' from the token body.
                    sentenceTokens.addAll(split(current, current.length() - 1));
                    if (!isSrxBreakingException(sentenceTokens, current)) {
                        break;
                    }
                } else {
                    sentenceTokens.add(current);
                }
            }
            return sentenceTokens;
        }

        /**
         * Cuts {@code token} at the given offsets (relative to the token start) and returns
         * the non-empty pieces in order. Offsets that do not advance past the previous cut
         * produce no piece; whatever follows the last cut becomes the final piece.
         *
         * @param token the token to cut
         * @param iArr  cut offsets within the token's raw text
         * @return new tokens whose positions are shifted by their offset inside {@code token}
         */
        private List<Token> split(Token token, int... iArr) {
            List<Token> parts = new ArrayList<Token>(iArr.length + 1);
            int start = 0;
            for (int k = 0; k < iArr.length && start < token.length(); k++) {
                int cut = iArr[k];
                if (cut > start) {
                    parts.add(new Token(token.getPosition() + start, token.subRawText(start, cut)));
                }
                start = cut;
            }
            if (start < token.length()) {
                parts.add(new Token(token.getPosition() + start, token.subRawText(start, token.length())));
            }
            return parts;
        }

        /**
         * Returns {@code true} when the first character of {@code str} is upper-case;
         * {@code false} for an empty string.
         */
        private boolean startsWithUpperCase(String str) {
            // The stack may hold empty tokens (next() skips them), so guard charAt(0).
            return !str.isEmpty() && Character.isUpperCase(str.charAt(0));
        }

        /**
         * Decides whether {@code token} has to be broken up before it can be classified.
         *
         * <p>Checks, in order: an embedded {@code ''} / {@code ``} after the first character,
         * a leading opening symbol (which is appended to {@code list} directly while the rest
         * is returned), a trailing punctuation character, and an ending symbol between two
         * alphabetic parts.
         *
         * @param token non-empty token to inspect
         * @param list  tokens of the current sentence; receives a detached leading symbol
         * @return the pieces to push back on the stack, or an empty list when no split applies
         */
        private List<Token> tokenSplitCheck(Token token, List<Token> list) {
            String rawText = token.getRawText();

            int quoteIndex = indexOfDoubleQuote(rawText);
            if (quoteIndex > 0) {
                return split(token, quoteIndex, quoteIndex + 2);
            }

            if (isQuotationOrComaBegining(rawText)) {
                List<Token> headAndRest = split(token, 1);
                list.add(headAndRest.get(0));
                return Collections.singletonList(headAndRest.get(1));
            }

            if (endsWithInterpunctionSigns(token)) {
                return split(token, token.length() - 1);
            }

            int endingIndex = indexOfEndingSymbol(rawText);
            if (endingIndex > 0) {
                char last = rawText.charAt(token.length() - 1);
                boolean hasTrailingEllipsis = SpecialPatterns.ELIPSIS_AT_END.matcher(rawText).find();
                if (!endsWithEndingSymbol(last) && !hasTrailingEllipsis) {
                    List<Token> around = split(token, endingIndex, endingIndex + 1);
                    if (isEndingSymbolsBetweenAlphas(around)) {
                        return around;
                    }
                }
            }
            return new ArrayList<Token>();
        }
    }

    /**
     * Creates a splitter.
     *
     * @param list segmentation rules; {@code null} or empty disables rule-based exceptions
     * @param str  characters detached from the end of longer tokens; {@code null} is treated
     *             as no punctuation
     */
    public SentenceSplitterImpl(List<SrxRule> list, String str) {
        this.rules = list;
        this.punctuation = str == null ? "" : str;
    }

    /**
     * Normalizes {@code str} with {@code TextNormalizerFactory.create()} and splits the result.
     *
     * @param str raw text
     * @return the sentences found; an empty list when {@code str} is {@code null} or empty
     */
    @Override // sec.bdc.tm.hte.eu.preprocessing.bnlp.sentsplit.SentenceSplitter
    public List<Sentence> split(String str) {
        if (str == null || str.isEmpty()) {
            return new ArrayList<Sentence>();
        }
        return split(TextNormalizerFactory.create().normalize(str));
    }

    /**
     * Splits an already normalized text into sentences. Token groups that come back empty
     * from the cursor are dropped.
     *
     * @param normalizedText normalized text providing the token list
     * @return the sentences in text order
     */
    @Override // sec.bdc.tm.hte.eu.preprocessing.bnlp.sentsplit.SentenceSplitter
    public List<Sentence> split(NormalizedText normalizedText) {
        List<Sentence> sentences = new ArrayList<Sentence>();
        StandardSentenceSplitter cursor = new StandardSentenceSplitter(normalizedText);
        while (cursor.hasNext()) {
            List<Token> sentenceTokens = cursor.next();
            if (!sentenceTokens.isEmpty()) {
                sentences.add(normalizedText.createSentence(1, sentenceTokens));
            }
        }
        return sentences;
    }
}
