package sec.bdc.tm.hte.eu.preprocessing.bnlp.segment;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import sec.bdc.nlp.ds.Sentence;
import sec.bdc.nlp.ds.Token;
import sec.bdc.nlp.ds.Word;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.tagsplit.TagSplitter;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.utils.DsUtils;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.utils.SeparatedStringBuilder;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: classes49.dex */
/**
 * {@link Segmenter} that re-splits the words of a {@link Sentence}.
 *
 * <p>For every word, {@link #segment(Sentence)} first tries a list of special patterns
 * (URLs, prices, percentages, hashtags, ...). Words that match none of them are split on
 * punctuation, and every resulting non-punctuation piece is then split on the configured
 * prefixes, suffixes and infixes.
 *
 * <p>Lookup keys for the prefix and suffix maps are built with
 * {@code toLowerCase(Locale.ROOT)} so that matching does not depend on the default locale
 * of the JVM (for example, the Turkish dotless-i mapping).
 */
public class CasesBasedSegmenter implements Segmenter {
    /** POS tag set on tokens that were produced from a punctuation match. */
    private static final String INTERP_TAG = "interp";

    /** Matches a single letter or digit; used to locate the "core" of a word. */
    private static final Pattern LETTER_PATTERN = Pattern.compile("\\p{L}|\\p{Digit}");

    /** Punctuation characters for which a run of the same character is matched as one unit. */
    private static final String[] PUNCTUATION_IN_NONSPLITABLE_SEQUENCES = {"'", ".", "?", "!", "\"", "\\-"};

    /** Punctuation characters that are always matched one character at a time. */
    private static final String PUNCTUATION_USED_IN_SPLITTING_REGEX = "#()*+,/:;<=>@^`{|}~\\[\\]";

    // Presumably SeparatedStringBuilder.build joins the items with the given separator, which
    // yields "[']+|[.]+|...|[\-]+|[#()*...]" -- TODO confirm against SeparatedStringBuilder.
    private static final String PUNCTUATION_CHAR_REGEX = String.format("[%s]+|[%s]", SeparatedStringBuilder.build("]+|[", PUNCTUATION_IN_NONSPLITABLE_SEQUENCES), PUNCTUATION_USED_IN_SPLITTING_REGEX);
    private static final Pattern PUNCTUATION_CHAR_PATTERN = Pattern.compile(PUNCTUATION_CHAR_REGEX);

    /** Known prefixes (looked up by lower-cased text) mapped to the pieces they are tokenized into. */
    private final Map<String, String[]> prefixes;
    /** Distinct prefix key lengths, longest first; stays {@code null} when there are no prefixes. */
    private int[] prefixesLengthsDescending;
    /** Known suffixes (looked up by lower-cased text) mapped to the pieces they are tokenized into. */
    private final Map<String, String[]> suffixes;
    /** Distinct suffix key lengths, longest first; stays {@code null} when there are no suffixes. */
    private int[] suffixesLengthsDescending;
    private final TagSplitter tagSplitter;
    /** Infix regular expressions; a word is tokenized around every match. */
    private final List<String> wordInfixes;
    /** Compiled alternation of {@link #wordInfixes}; stays {@code null} when there are no infixes. */
    private Pattern wordInfixesSplitPattern;
    private String wordInfixesSplitRegex;
    /** Single punctuation characters that occur inside any configured prefix, suffix or infix. */
    private final Set<String> wordsBelongingPunctuation = new HashSet<>();

    /**
     * Creates a segmenter from the given affix data.
     *
     * @param caseBasedSegmenterData source of the prefix map, suffix map and infix list
     * @param tagSplitter optional post-processor applied to the sentence after segmentation;
     *     may be {@code null}
     */
    public CasesBasedSegmenter(CaseBasedSegmenterData caseBasedSegmenterData, TagSplitter tagSplitter) {
        this.prefixes = caseBasedSegmenterData.getPrefixes();
        if (!this.prefixes.isEmpty()) {
            addWordsBelongingPunctuation(this.prefixes.keySet());
            this.prefixesLengthsDescending = stringsLengthsDescending(this.prefixes.keySet());
        }
        this.suffixes = caseBasedSegmenterData.getSuffixes();
        if (!this.suffixes.isEmpty()) {
            addWordsBelongingPunctuation(this.suffixes.keySet());
            this.suffixesLengthsDescending = stringsLengthsDescending(this.suffixes.keySet());
        }
        this.wordInfixes = caseBasedSegmenterData.getWordInfixes();
        if (!this.wordInfixes.isEmpty()) {
            addWordsBelongingPunctuation(this.wordInfixes);
            if (this.wordInfixes.size() == 1) {
                // A single infix is used as the regex as-is (no grouping parentheses).
                this.wordInfixesSplitRegex = this.wordInfixes.get(0);
            } else {
                this.wordInfixesSplitRegex = "(" + SeparatedStringBuilder.build(")|(", this.wordInfixes) + ")";
            }
            this.wordInfixesSplitPattern = Pattern.compile(this.wordInfixesSplitRegex);
        }
        this.tagSplitter = tagSplitter;
    }

    /**
     * Adds a one-token word for {@code text[begin, end)} to {@code target}; does nothing for an
     * empty range.
     *
     * @param position position stored in the new token
     * @param punctuation when {@code true} the token is tagged with {@link #INTERP_TAG}
     */
    private static void addSingleTokenWordIfNotEmpty(String text, int begin, int end, int position, boolean punctuation, Collection<Word> target) {
        if (begin >= end) {
            return;
        }
        Word word = new Word(new Token(position, text.substring(begin, end)));
        if (punctuation) {
            word.getTokenList().get(0).setPosTag(INTERP_TAG);
        }
        target.add(word);
    }

    /** Adds {@code text[begin, end)} to {@code target} unless the range is empty. */
    private static void addSubstrIfNotEmpty(String text, int begin, int end, Collection<String> target) {
        if (begin < end) {
            target.add(text.substring(begin, end));
        }
    }

    /**
     * Records every single character of the given strings that {@link SpecialPatternsMatcher}
     * reports as punctuation.
     */
    private void addWordsBelongingPunctuation(Iterable<String> strings) {
        for (String candidate : strings) {
            for (int i = 0; i < candidate.length(); i++) {
                String character = candidate.substring(i, i + 1);
                if (new SpecialPatternsMatcher(character).isPunct()) {
                    this.wordsBelongingPunctuation.add(character);
                }
            }
        }
    }

    /**
     * Builds a multi-token word for {@code text}, one token per entry of {@code parts}.
     *
     * @param position position of the word and of its first token; each following token is
     *     offset by the accumulated lengths of the preceding parts
     * @param sliceFromText when {@code true} the token texts are consecutive slices of
     *     {@code text} with the lengths of the parts (keeping the casing of {@code text});
     *     otherwise the parts themselves are used
     */
    private static Word createWordFromArray(String text, int position, String[] parts, boolean sliceFromText) {
        Word word = new Word(position, text);
        word.setTokenList(new ArrayList<Token>());
        int tokenPosition = position;
        int sliceStart = 0;
        for (String part : parts) {
            if (sliceFromText) {
                int sliceEnd = sliceStart + part.length();
                word.addToken(new Token(tokenPosition, text.substring(sliceStart, sliceEnd)));
                sliceStart = sliceEnd;
            } else {
                word.addToken(new Token(tokenPosition, part));
            }
            tokenPosition += part.length();
        }
        return word;
    }

    /** Returns the index of the first letter or digit in {@code text}, or -1 if there is none. */
    private static int firstLetterPosition(CharSequence text) {
        Matcher matcher = LETTER_PATTERN.matcher(text);
        return matcher.find() ? matcher.start() : -1;
    }

    /**
     * Returns the index of the first entry of {@code lengths} that is strictly smaller than the
     * length of {@code text}, or -1 when no entry is.
     */
    private static int indexOfFirstShorterString(CharSequence text, int[] lengths) {
        for (int i = 0; i < lengths.length; i++) {
            if (text.length() > lengths[i]) {
                return i;
            }
        }
        return -1;
    }

    /** Unboxes a list of integers into an {@code int[]} in the same order. */
    private static int[] toIntArray(List<Integer> values) {
        int[] result = new int[values.size()];
        for (int i = 0; i < result.length; i++) {
            result[i] = values.get(i).intValue();
        }
        return result;
    }

    /** Returns the index of the last letter or digit in {@code text}, or -1 if there is none. */
    private static int lastLetterPosition(CharSequence text) {
        Matcher matcher = LETTER_PATTERN.matcher(text);
        int last = -1;
        while (matcher.find()) {
            last = matcher.start();
        }
        return last;
    }

    /**
     * Splits {@code word} around every match of {@code pattern}; each match becomes its own word
     * tagged with {@link #INTERP_TAG}.
     *
     * <p>A match is left inside the surrounding text when it lies strictly between the first and
     * the last letter/digit of the word and its text is contained in {@code keptInsideWord}.
     *
     * @param keptInsideWord matches that must not split the word when they occur inside it;
     *     {@code null} disables this exception
     * @param prefixMap the word is returned unsplit when its lower-cased text is a key of this map
     * @param suffixMap the word is returned unsplit when its text is a key of this map
     * @return the resulting words in order; the original {@code word} instance when nothing
     *     was split off; an empty list when the raw text is empty
     */
    private List<Word> splitOnPattern(Word word, Set<String> keptInsideWord, Pattern pattern, Map<String, String[]> prefixMap, Map<String, String[]> suffixMap) {
        String rawText = word.getRawText();
        if (prefixMap.containsKey(rawText.toLowerCase(Locale.ROOT))) {
            return Collections.singletonList(word);
        }
        // NOTE(review): unlike the prefix check above and trySplitOnSuffix, this lookup is
        // case-sensitive. Kept as found -- confirm whether that is intended.
        if (suffixMap.containsKey(rawText)) {
            return Collections.singletonList(word);
        }
        List<Word> result = new ArrayList<>();
        Matcher matcher = pattern.matcher(rawText);
        int wordPosition = word.getPosition();
        int firstLetter = firstLetterPosition(rawText);
        int lastLetter = lastLetterPosition(rawText);
        int consumed = 0;
        while (matcher.find()) {
            int start = matcher.start();
            boolean insideWord = start > firstLetter && start < lastLetter;
            if (keptInsideWord != null && insideWord && keptInsideWord.contains(matcher.group())) {
                continue;
            }
            addSingleTokenWordIfNotEmpty(rawText, consumed, start, wordPosition + consumed, false, result);
            addSingleTokenWordIfNotEmpty(rawText, start, matcher.end(), wordPosition + start, true, result);
            consumed = matcher.end();
        }
        if (consumed >= rawText.length()) {
            return result;
        }
        if (consumed == 0) {
            // Nothing was split off: keep the original word object (and its tokens) untouched.
            result.add(word);
            return result;
        }
        addSingleTokenWordIfNotEmpty(rawText, consumed, rawText.length(), wordPosition + consumed, false, result);
        return result;
    }

    /** Splits {@code word} on {@link #PUNCTUATION_CHAR_PATTERN}; see {@code splitOnPattern}. */
    private List<Word> splitOnPunctuation(Word word, Set<String> keptInsideWord, Map<String, String[]> prefixMap, Map<String, String[]> suffixMap) {
        return splitOnPattern(word, keptInsideWord, PUNCTUATION_CHAR_PATTERN, prefixMap, suffixMap);
    }

    /**
     * Splits {@code text} around the matches of {@code pattern}, keeping the non-empty matches
     * themselves as separate elements.
     */
    private static List<String> splitPreservingDelimiters(String text, Pattern pattern) {
        List<String> pieces = new ArrayList<>();
        int consumed = 0;
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            addSubstrIfNotEmpty(text, consumed, matcher.start(), pieces);
            addSubstrIfNotEmpty(text, matcher.start(), matcher.end(), pieces);
            consumed = matcher.end();
        }
        addSubstrIfNotEmpty(text, consumed, text.length(), pieces);
        return pieces;
    }

    /**
     * Splits {@code word} into a prefix word (tokenized according to {@code parts}) and a
     * single-token word holding the rest of the raw text.
     */
    private List<Word> splitWordWithPrefix(Word word, String prefix, String[] parts) {
        List<Word> result = new ArrayList<>(2);
        result.add(createWordFromArray(prefix, word.getPosition(), parts, true));
        result.add(new Word(new Token(word.getPosition() + prefix.length(), word.getRawText().substring(prefix.length()))));
        return result;
    }

    /**
     * Splits {@code word} into a single-token stem word and a suffix word (tokenized according
     * to {@code parts}).
     */
    private List<Word> splitWordWithSuffix(Word word, String suffix, String[] parts) {
        List<Word> result = new ArrayList<>(2);
        String rawText = word.getRawText();
        int stemLength = rawText.length() - suffix.length();
        result.add(new Word(new Token(word.getPosition(), rawText.substring(0, stemLength))));
        result.add(createWordFromArray(suffix, word.getPosition() + stemLength, parts, true));
        return result;
    }

    /** Returns the distinct lengths of the given strings, sorted from longest to shortest. */
    private static int[] stringsLengthsDescending(Iterable<String> strings) {
        Set<Integer> distinctLengths = new HashSet<>();
        for (String value : strings) {
            distinctLengths.add(Integer.valueOf(value.length()));
        }
        List<Integer> sorted = new ArrayList<>(distinctLengths);
        Collections.sort(sorted);
        Collections.reverse(sorted);
        return toIntArray(sorted);
    }

    /**
     * Splits {@code word} on a known prefix (if any) and then applies suffix/infix splitting to
     * everything but the prefix itself; all resulting words are appended to {@code target}.
     */
    private void trySplitOnAffixes(Collection<Word> target, Word word) {
        List<Word> prefixSplit = trySplitOnPrefix(word);
        if (prefixSplit.isEmpty()) {
            trySplitOnInfixAndSuffix(word, target);
            return;
        }
        // The first element is the prefix word and is final; only the rest is processed further.
        target.add(prefixSplit.get(0));
        for (Word remainder : prefixSplit.subList(1, prefixSplit.size())) {
            trySplitOnInfixAndSuffix(remainder, target);
        }
    }

    /**
     * Re-tokenizes {@code word} in place around the configured infixes.
     *
     * @return {@code true} when the token list of the word was replaced
     */
    private boolean trySplitOnInfix(Word word) {
        if (this.wordInfixes.isEmpty()) {
            return false;
        }
        List<String> pieces = splitPreservingDelimiters(word.getRawText(), this.wordInfixesSplitPattern);
        if (pieces.size() == 1) {
            return false;
        }
        word.getTokenList().clear();
        int position = word.getPosition();
        for (String piece : pieces) {
            word.addToken(new Token(position, piece));
            position += piece.length();
        }
        return true;
    }

    /**
     * Splits {@code word} on a known suffix (if any), applies infix tokenization to the stem (or
     * to the whole word when there is no suffix) and appends the result to {@code target}.
     */
    private void trySplitOnInfixAndSuffix(Word word, Collection<Word> target) {
        List<Word> suffixSplit = trySplitOnSuffix(word);
        if (suffixSplit.isEmpty()) {
            trySplitOnInfix(word);
            target.add(word);
            return;
        }
        trySplitOnInfix(suffixSplit.get(0));
        target.addAll(suffixSplit);
    }

    /**
     * Looks for the longest known prefix that is strictly shorter than the word.
     *
     * @return the prefix word followed by the remainder, or an empty list when no prefix matches
     */
    private List<Word> trySplitOnPrefix(Word word) {
        if (this.prefixes.isEmpty()) {
            return Collections.emptyList();
        }
        String rawText = word.getRawText();
        int firstCandidate = indexOfFirstShorterString(rawText, this.prefixesLengthsDescending);
        if (firstCandidate == -1) {
            return Collections.emptyList();
        }
        // Lengths are sorted longest first, so the first hit is the longest matching prefix.
        for (int i = firstCandidate; i < this.prefixesLengthsDescending.length; i++) {
            String candidate = rawText.substring(0, this.prefixesLengthsDescending[i]);
            String key = candidate.toLowerCase(Locale.ROOT);
            if (this.prefixes.containsKey(key)) {
                return splitWordWithPrefix(word, candidate, this.prefixes.get(key));
            }
        }
        return Collections.emptyList();
    }

    /**
     * Looks for the longest known suffix that is strictly shorter than the word.
     *
     * @return the stem followed by the suffix word, or an empty list when no suffix matches
     */
    private List<Word> trySplitOnSuffix(Word word) {
        if (this.suffixes.isEmpty()) {
            return Collections.emptyList();
        }
        String rawText = word.getRawText();
        int firstCandidate = indexOfFirstShorterString(rawText, this.suffixesLengthsDescending);
        if (firstCandidate == -1) {
            return Collections.emptyList();
        }
        // Lengths are sorted longest first, so the first hit is the longest matching suffix.
        for (int i = firstCandidate; i < this.suffixesLengthsDescending.length; i++) {
            String candidate = rawText.substring(rawText.length() - this.suffixesLengthsDescending[i]);
            String key = candidate.toLowerCase(Locale.ROOT);
            if (this.suffixes.containsKey(key)) {
                return splitWordWithSuffix(word, candidate, this.suffixes.get(key));
            }
        }
        return Collections.emptyList();
    }

    /**
     * Tells whether the matched text is one of the patterns that are kept as a single word:
     * URL (with dots), emoticon, e-mail address, version number or HTML encoding.
     *
     * @param specialPatternsMatcher matcher created for the raw text of the word
     */
    protected boolean isNonSpecialCase(SpecialPatternsMatcher specialPatternsMatcher) {
        return specialPatternsMatcher.isURLWithDots()
                || specialPatternsMatcher.isEmoticon()
                || specialPatternsMatcher.isEmailAdress()
                || specialPatternsMatcher.isURL()
                || specialPatternsMatcher.isVersionNumber()
                || specialPatternsMatcher.isHTMLEncoding();
    }

    /**
     * Replaces the word list of {@code sentence} with its segmented form, lets the tag splitter
     * (if any) fix the sentence and finally renumbers the words starting from 1.
     */
    @Override // sec.bdc.tm.hte.eu.preprocessing.bnlp.segment.Segmenter
    public void segment(Sentence sentence) {
        List<Word> wordList = sentence.getWordList();
        List<Word> segmented = new ArrayList<>(wordList.size());
        int lastIndex = wordList.size() - 1;
        for (int i = 0; i < wordList.size(); i++) {
            Word word = wordList.get(i);
            boolean isLastWord = i == lastIndex;
            if (tryProcessSpecialCases(word, isLastWord, segmented)) {
                continue;
            }
            for (Word piece : splitOnPunctuation(word, this.wordsBelongingPunctuation, this.prefixes, this.suffixes)) {
                // Pieces that already carry a POS tag (punctuation) are not split any further.
                if (DsUtils.getMeaningfulToken(piece).getPosTag() == null) {
                    trySplitOnAffixes(segmented, piece);
                } else {
                    segmented.add(piece);
                }
            }
        }
        sentence.setWordList(segmented);
        if (this.tagSplitter != null) {
            this.tagSplitter.fixSentence(sentence);
        }
        // Re-read the list: the tag splitter may have changed it.
        for (int i = 0; i < sentence.getWordList().size(); i++) {
            sentence.getWordList().get(i).setIndex(i + 1);
        }
    }

    /**
     * Turns every match into one word whose raw token is capture group 1 and whose token list
     * consists of the non-null capture groups 2 and up. Text after the last consumed group is
     * appended as a single-token word.
     *
     * @param i initial capacity of each token list
     * @param matcher matcher over the raw text of {@code word}
     * @param word word being split; supplies the base position
     */
    protected Collection<Word> splitWithRegexNonSeparateWords(int i, Matcher matcher, Word word) {
        List<Word> result = new ArrayList<>();
        int consumed = 0;
        int position = word.getPosition();
        while (matcher.find()) {
            int groupLimit = matcher.groupCount() + 1;
            List<Token> tokens = new ArrayList<>(i);
            Word matched = new Word(new Token(matcher.start(1) + position, matcher.group(1)));
            for (int group = 2; group < groupLimit; group++) {
                if (matcher.group(group) != null) {
                    tokens.add(new Token(matcher.start(group) + position, matcher.group(group)));
                    consumed = matcher.end(group);
                }
            }
            matched.setTokenList(tokens);
            result.add(matched);
        }
        if (consumed < word.length()) {
            result.add(new Word(new Token(position + consumed, word.getRawText().substring(consumed))));
        }
        return result;
    }

    /**
     * Turns each non-null capture group in the range {@code [2, 2 + i)} of every match into its
     * own single-token word. Text after the last consumed group is appended as one more word.
     *
     * @param i number of capture groups (starting at group 2) to extract per match
     * @param matcher matcher over the raw text of {@code word}
     * @param word word being split; supplies the base position
     */
    protected Collection<Word> splitWithRegexSeparateWords(int i, Matcher matcher, Word word) {
        List<Word> result = new ArrayList<>();
        int consumed = 0;
        int position = word.getPosition();
        while (matcher.find()) {
            int groupLimit = 2 + i;
            for (int group = 2; group < groupLimit; group++) {
                if (matcher.group(group) != null) {
                    result.add(new Word(new Token(matcher.start(group) + position, matcher.group(group))));
                    consumed = matcher.end(group);
                }
            }
        }
        if (consumed < word.length()) {
            result.add(new Word(new Token(position + consumed, word.getRawText().substring(consumed))));
        }
        return result;
    }

    /**
     * Splits {@code word} with {@code pattern}.
     *
     * @param i forwarded to the chosen strategy (group count or token-list capacity)
     * @param z {@code true} to emit every group as a separate word, {@code false} to emit one
     *     multi-token word per match
     */
    protected Collection<Word> splitWithRegexWordIntoNTokens(Word word, Pattern pattern, int i, boolean z) {
        Matcher matcher = pattern.matcher(word.getRawText());
        if (z) {
            return splitWithRegexSeparateWords(i, matcher, word);
        }
        return splitWithRegexNonSeparateWords(i, matcher, word);
    }

    /** Splits {@code str} off the end of {@code word} as a single-token suffix word. */
    protected List<Word> splitWordWithSuffix(Word word, String str) {
        return splitWordWithSuffix(word, str, new String[]{str});
    }

    /**
     * Handles words that match one of the special patterns and appends the outcome to
     * {@code collection}.
     *
     * @param word word to examine
     * @param z whether the abbreviation rule may fire; {@link #segment(Sentence)} passes
     *     {@code true} only for the last word of the sentence
     * @param collection receives the resulting word(s)
     * @return {@code true} when the word was handled here; {@code false} when the caller has to
     *     segment it generically
     */
    protected boolean tryProcessSpecialCases(Word word, boolean z, Collection<Word> collection) {
        String rawText = word.getRawText();
        SpecialPatternsMatcher specialPatternsMatcher = new SpecialPatternsMatcher(rawText);
        if (isNonSpecialCase(specialPatternsMatcher)) {
            // Kept as found: the result is whatever Collection.add reports (true for a List).
            return collection.add(word);
        }
        if (specialPatternsMatcher.isURLwithTextAtBegin()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.URL_WITH_TEXT_AT_BEGIN_PATTERN, 2, true));
            return true;
        }
        if (specialPatternsMatcher.isURLwithHashTagOrUserMention()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.URL_WITH_HASHTAG_OR_USERMENTION_AT_END_PATTERN, 2, true));
            return true;
        }
        if (z && specialPatternsMatcher.isAbbrevation()) {
            collection.addAll(splitWordWithSuffix(word, "."));
            return true;
        }
        if (specialPatternsMatcher.isDisplaySize()) {
            // Kept as found: reports whether anything was added, so an empty split result makes
            // the caller fall back to generic segmentation.
            return collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.DISPLAY_SIZE_SPLIT_PATTERN, 2, false));
        }
        if (specialPatternsMatcher.isPercentage()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.NUMBER_WITH_PERCENT_SPLIT_PATTERN, 2, false));
            return true;
        }
        if (specialPatternsMatcher.isPrice()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.PRICE_SPLIT_PATTERN, 2, false));
            return true;
        }
        if (specialPatternsMatcher.containsGrades()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.GRADES_SPLIT_PATTERN, 0, false));
            return true;
        }
        if (specialPatternsMatcher.isUserMention()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.MENTION_NOSPLIT_PATTERN, 2, false));
            return true;
        }
        if (specialPatternsMatcher.isHashTag()) {
            collection.addAll(splitWithRegexWordIntoNTokens(word, SpecialPatterns.HASHTAG_NOSPLIT_PATTERN, 2, false));
            return true;
        }
        if (specialPatternsMatcher.isWordContainsOnlySpecialUTF8Signs() || specialPatternsMatcher.howManySpecialUTF8SignsAtTheEnd() <= 0) {
            if (!specialPatternsMatcher.isNonSplittable()) {
                return false;
            }
            collection.add(word);
            return true;
        }
        // The word ends with special signs: split it into the leading text and the trailing signs.
        // The matcher is queried again for each part, exactly as before, because it is not
        // visible here whether the call is free of side effects.
        String head = rawText.substring(0, rawText.length() - specialPatternsMatcher.howManySpecialUTF8SignsAtTheEnd());
        String tail = rawText.substring(rawText.length() - specialPatternsMatcher.howManySpecialUTF8SignsAtTheEnd());
        Word headWord = new Word(new Token(word.getPosition(), head));
        Word tailWord = new Word(new Token(word.getPosition() + head.length(), tail));
        collection.add(headWord);
        collection.add(tailWord);
        return true;
    }
}
