package sec.bdc.tm.hte.eu.ngram.extraction;

import com.google.common.collect.EvictingQueue;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multiset;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sec.bdc.nlp.Language;
import sec.bdc.tm.hte.eu.ngram.NgramException;
import sec.bdc.tm.hte.eu.ngram.interfaces.Extractor;
import sec.bdc.tm.hte.eu.ngram.settings.PropertyConstantsEnums;
import sec.bdc.tm.hte.eu.ngram.structures.Keyphrase;
import sec.bdc.tm.hte.eu.ngram.tokenizer.BnlpTokenizer;
import sec.bdc.tm.hte.eu.ngram.tokenizer.Tokenizer;
import sec.bdc.tm.hte.eu.ngram.tokenizer.Twokenizer;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.BasicNLPAnalyzerEu;
import sec.bdc.tm.hte.eu.preprocessing.resource.ResourceObject;

/* loaded from: classes49.dex */
public class NgramFinder implements Extractor {
    // SLF4J logger used for the progress/statistics messages emitted by this class.
    private static final Logger LOG = LoggerFactory.getLogger((Class<?>) NgramFinder.class);
    // Capacity of the sliding token window in extractDistinctNgramsFromSpan, i.e. the longest n-gram produced.
    private int maxNgramSize;
    // Threshold handed to trimInfrequentItems: items counted fewer times than this are removed.
    private int minNgramCount;
    // Passed to Subgrams.trimSubGrams; its validation rules are defined outside this file.
    private final NgramValidator ngramValidator;
    // Passed to Subgrams.trimSubGrams; its exact effect is defined outside this file.
    private double subgramFactor;
    // Turns each input string into a list of normalized tokens via tokenizeNormalize.
    private Tokenizer tokenizer;

    /**
     * Creates a finder whose validator and tokenizer are built from a {@link Language}.
     *
     * @param i maximum n-gram size (stored as {@code maxNgramSize})
     * @param i2 minimum count an item needs to be kept (stored as {@code minNgramCount})
     * @param d sub-gram factor (stored as {@code subgramFactor})
     * @param language language used to build the {@link NgramValidator} and, for BNLP, the tokenizer
     * @param tokenizerType which tokenizer implementation to load
     * @param basicNLPAnalyzerEu analyzer handed to the BNLP tokenizer; unused for Twokenizer
     * @throws NgramException declared by the validator/tokenizer construction this delegates to
     * @throws IllegalArgumentException if {@code tokenizerType} is neither TWOKENIZER nor BNLP
     */
    public NgramFinder(int i, int i2, double d, Language language, PropertyConstantsEnums.TokenizerType tokenizerType, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        this(i, i2, d, new NgramValidator(language), loadTokenizer(language, tokenizerType, basicNLPAnalyzerEu));
    }

    /**
     * Shared constructor that both public constructors delegate to; it only stores its arguments.
     *
     * @param i maximum n-gram size
     * @param i2 minimum count an item needs to be kept
     * @param d sub-gram factor
     * @param ngramValidator validator later passed to {@code Subgrams.trimSubGrams}
     * @param tokenizer tokenizer used to split and normalize every input string
     */
    private NgramFinder(int i, int i2, double d, NgramValidator ngramValidator, Tokenizer tokenizer) {
        // Collaborators first, then the numeric settings; the assignments are independent.
        this.tokenizer = tokenizer;
        this.ngramValidator = ngramValidator;
        this.subgramFactor = d;
        this.minNgramCount = i2;
        this.maxNgramSize = i;
    }

    /**
     * Creates a finder whose validator and tokenizer are built from a {@link ResourceObject}.
     *
     * @param i maximum n-gram size (stored as {@code maxNgramSize})
     * @param i2 minimum count an item needs to be kept (stored as {@code minNgramCount})
     * @param d sub-gram factor (stored as {@code subgramFactor})
     * @param resourceObject resource used to build the {@link NgramValidator} and, for BNLP, the tokenizer
     * @param tokenizerType which tokenizer implementation to load
     * @param basicNLPAnalyzerEu analyzer handed to the BNLP tokenizer; unused for Twokenizer
     * @throws NgramException declared by the validator/tokenizer construction this delegates to
     * @throws IllegalArgumentException if {@code tokenizerType} is neither TWOKENIZER nor BNLP
     */
    public NgramFinder(int i, int i2, double d, ResourceObject resourceObject, PropertyConstantsEnums.TokenizerType tokenizerType, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        this(i, i2, d, new NgramValidator(resourceObject), loadTokenizer(resourceObject, tokenizerType, basicNLPAnalyzerEu));
    }

    /**
     * Splits a token sequence into maximal runs of consecutive tokens that all belong to {@code set}.
     *
     * <p>Tokens not contained in {@code set} act as separators and are dropped. Empty runs are never
     * emitted, so adjacent separators or separators at either end produce no empty spans.
     *
     * @param list tokens in their original order
     * @param set tokens that are allowed inside a span
     * @return the spans in order of appearance; empty if no token of {@code list} is in {@code set}
     */
    public static List<List<String>> divideIntoSpans(List<String> list, Set<String> set) {
        List<List<String>> spans = new ArrayList<>();
        List<String> currentSpan = new ArrayList<>();
        for (String token : list) {
            if (set.contains(token)) {
                currentSpan.add(token);
            } else if (!currentSpan.isEmpty()) {
                // A separator closes the running span; start a fresh list so the stored one stays intact.
                spans.add(currentSpan);
                currentSpan = new ArrayList<>();
            }
        }
        // Flush the trailing span when the input does not end on a separator.
        if (!currentSpan.isEmpty()) {
            spans.add(currentSpan);
        }
        return spans;
    }

    /**
     * Collects every distinct n-gram of length 1..{@code maxNgramSize} found in one span.
     *
     * <p>A bounded queue holds the most recent {@code maxNgramSize} tokens. After each token is
     * appended, every suffix of that window (each n-gram ending at the current token) is recorded.
     *
     * @param list tokens of a single span, in order
     * @return the distinct n-grams of the span, each as a list of tokens
     */
    private Set<List<String>> extractDistinctNgramsFromSpan(List<String> list) {
        Set<List<String>> ngrams = new HashSet<>();
        EvictingQueue<String> window = EvictingQueue.create(this.maxNgramSize);
        for (String token : list) {
            window.add(token);
            // Copy the window once per token instead of once per suffix; the snapshot is never
            // modified afterwards, so the subList views stored below remain valid.
            List<String> snapshot = new ArrayList<>(window);
            int windowSize = snapshot.size();
            for (int start = 0; start < windowSize; start++) {
                ngrams.add(snapshot.subList(start, windowSize));
            }
        }
        return ngrams;
    }

    /**
     * Counts n-grams across all input strings and keeps only the frequent ones.
     *
     * <p>Each input string is tokenized, cut into spans of tokens contained in {@code set}, and its
     * distinct n-grams are gathered in a set first, so an n-gram is counted at most once per input
     * string. The totals are then pruned by {@code minNgramCount} and by
     * {@code Subgrams.trimSubGrams}.
     *
     * @param list input strings
     * @param set tokens allowed inside a span (see {@link #divideIntoSpans})
     * @return multiset of the n-grams that survive both pruning steps
     * @throws NgramException declared by the tokenizer / sub-gram trimming this method calls
     */
    private Multiset<List<String>> extractFrequentNgrams(List<String> list, Set<String> set) throws NgramException {
        HashMultiset<List<String>> ngramCounts = HashMultiset.create();
        for (String text : list) {
            List<String> tokens = this.tokenizer.tokenizeNormalize(text);
            // Per-string set: repeated n-grams inside one string contribute a single occurrence.
            Set<List<String>> ngramsOfText = new HashSet<>();
            for (List<String> span : divideIntoSpans(tokens, set)) {
                ngramsOfText.addAll(extractDistinctNgramsFromSpan(span));
            }
            ngramCounts.addAll(ngramsOfText);
        }
        trimInfrequentItems(ngramCounts, this.minNgramCount);
        Subgrams.trimSubGrams(ngramCounts, this.ngramValidator, this.subgramFactor);
        int distinctNgrams = ngramCounts.elementSet().size();
        LOG.info("{} frequent ngrams.", Integer.valueOf(distinctNgrams));
        return ngramCounts;
    }

    /**
     * Finds the tokens that occur in at least {@code minNgramCount} input strings.
     *
     * <p>The tokens of each string are de-duplicated before counting, so the count of a token is the
     * number of input strings containing it.
     *
     * @param list input strings
     * @return the element set of the pruned token multiset
     * @throws NgramException declared by the tokenizer this method calls
     */
    private Set<String> extractFrequentUnigrams(List<String> list) throws NgramException {
        HashMultiset<String> tokenCounts = HashMultiset.create();
        for (String text : list) {
            // De-duplicate first: one occurrence per token per input string.
            Set<String> distinctTokens = new HashSet<>(this.tokenizer.tokenizeNormalize(text));
            tokenCounts.addAll(distinctTokens);
        }
        trimInfrequentItems(tokenCounts, this.minNgramCount);
        Set<String> frequentTokens = tokenCounts.elementSet();
        LOG.info("{} frequent unigrams.", Integer.valueOf(frequentTokens.size()));
        return frequentTokens;
    }

    /**
     * Tells whether {@code obj} occurs in {@code multiset} fewer than {@code i} times.
     *
     * <p>The name and the {@code synthetic} marker show this was emitted by the decompiler as a
     * lambda body belonging to {@code trimInfrequentItems}; the decompiler also widened its access
     * from package-private.
     *
     * @param multiset multiset whose counts are consulted
     * @param i minimum number of occurrences
     * @param obj element to check
     * @return {@code true} when the element's count is below {@code i}
     */
    public static final /* synthetic */ boolean lambda$trimInfrequentItems$0$NgramFinder(Multiset multiset, int i, Object obj) {
        final int occurrences = multiset.count(obj);
        return i > occurrences;
    }

    /**
     * Instantiates the tokenizer selected by {@code tokenizerType}, using a {@link Language} for BNLP.
     *
     * @param language language passed to {@link BnlpTokenizer}; ignored for Twokenizer
     * @param tokenizerType requested tokenizer implementation
     * @param basicNLPAnalyzerEu analyzer passed to {@link BnlpTokenizer}; ignored for Twokenizer
     * @return a new {@link Twokenizer} or {@link BnlpTokenizer}
     * @throws NgramException declared by the tokenizer construction
     * @throws IllegalArgumentException if {@code tokenizerType} is neither TWOKENIZER nor BNLP
     *     (including {@code null})
     */
    private static Tokenizer loadTokenizer(Language language, PropertyConstantsEnums.TokenizerType tokenizerType, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        if (PropertyConstantsEnums.TokenizerType.TWOKENIZER.equals(tokenizerType)) {
            LOG.info("Using Twokenizer tokenizer.");
            return new Twokenizer();
        } else if (PropertyConstantsEnums.TokenizerType.BNLP.equals(tokenizerType)) {
            LOG.info("Using BNLP tokenizer.");
            return new BnlpTokenizer(language, basicNLPAnalyzerEu);
        }
        // Anything else, null included, is rejected before any tokenizer is created.
        throw new IllegalArgumentException("Unsupported tokenizer type: " + tokenizerType);
    }

    /**
     * Instantiates the tokenizer selected by {@code tokenizerType}, using a {@link ResourceObject}
     * for BNLP.
     *
     * @param resourceObject resource passed to {@link BnlpTokenizer}; ignored for Twokenizer
     * @param tokenizerType requested tokenizer implementation
     * @param basicNLPAnalyzerEu analyzer passed to {@link BnlpTokenizer}; ignored for Twokenizer
     * @return a new {@link Twokenizer} or {@link BnlpTokenizer}
     * @throws NgramException declared by the tokenizer construction
     * @throws IllegalArgumentException if {@code tokenizerType} is neither TWOKENIZER nor BNLP
     *     (including {@code null})
     */
    private static Tokenizer loadTokenizer(ResourceObject resourceObject, PropertyConstantsEnums.TokenizerType tokenizerType, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        if (PropertyConstantsEnums.TokenizerType.TWOKENIZER.equals(tokenizerType)) {
            LOG.info("Using Twokenizer tokenizer.");
            return new Twokenizer();
        } else if (PropertyConstantsEnums.TokenizerType.BNLP.equals(tokenizerType)) {
            LOG.info("Using BNLP tokenizer.");
            return new BnlpTokenizer(resourceObject, basicNLPAnalyzerEu);
        }
        // Anything else, null included, is rejected before any tokenizer is created.
        throw new IllegalArgumentException("Unsupported tokenizer type: " + tokenizerType);
    }

    /**
     * Removes from {@code multiset} every element whose count is below {@code i} and logs how much
     * was dropped.
     *
     * <p>The multiset is modified in place. The log line reports removed occurrences and removed
     * distinct elements, each against the totals measured before trimming.
     *
     * @param multiset multiset to prune in place
     * @param i minimum count an element needs to be kept
     * @param <E> element type of the multiset
     */
    private <E> void trimInfrequentItems(final Multiset<E> multiset, final int i) {
        int occurrencesBefore = multiset.size();
        int distinctBefore = multiset.elementSet().size();
        // Typed lambda in place of the decompiler's raw anonymous Predicate. An element below the
        // threshold stays below it while its occurrences are removed, so all of them go.
        multiset.removeIf(element -> multiset.count(element) < i);
        int occurrencesRemoved = occurrencesBefore - multiset.size();
        int distinctRemoved = distinctBefore - multiset.elementSet().size();
        LOG.info("Trimmed {} occurrences (out of all {}) of {} (out of all {})", Integer.valueOf(occurrencesRemoved), Integer.valueOf(occurrencesBefore), Integer.valueOf(distinctRemoved), Integer.valueOf(distinctBefore));
    }

    /**
     * Extracts key phrases from a stream of input strings.
     *
     * <p>The stream is materialized into a list because it is traversed twice: once to find
     * frequent unigrams and once to count the n-grams built from them. Every surviving n-gram is
     * joined with single spaces into a phrase. N-grams that join to the same text are merged into
     * one {@link Keyphrase} whose count is the sum of their counts.
     *
     * @param stream input strings; consumed by this call
     * @return one {@link Keyphrase} per distinct phrase text
     * @throws NgramException declared by the unigram / n-gram extraction steps
     */
    @Override // sec.bdc.tm.hte.eu.ngram.interfaces.Extractor
    public Set<Keyphrase> extractPhrases(Stream<String> stream) throws NgramException {
        List<String> texts = stream.collect(Collectors.toList());
        Set<String> frequentUnigrams = extractFrequentUnigrams(texts);
        Multiset<List<String>> frequentNgrams = extractFrequentNgrams(texts, frequentUnigrams);
        HashMap<String, Keyphrase> phrasesByText = new HashMap<>();
        for (List<String> ngram : frequentNgrams.elementSet()) {
            String phraseText = String.join(StringUtils.SPACE, ngram);
            Keyphrase known = phrasesByText.get(phraseText);
            if (known == null) {
                phrasesByText.put(phraseText, new Keyphrase(phraseText, frequentNgrams.count(ngram)));
            } else {
                // Different token lists can join to the same text; fold their counts together.
                known.increaseCount(frequentNgrams.count(ngram));
            }
        }
        return new HashSet<>(phrasesByText.values());
    }
}
