package sec.bdc.tm.hte.eu.ngram;

import java.util.Set;
import java.util.function.Function;
import java.util.stream.Stream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sec.bdc.nlp.Language;
import sec.bdc.tm.hte.eu.ngram.clustering.PhraseClusterer;
import sec.bdc.tm.hte.eu.ngram.extraction.NgramFinder;
import sec.bdc.tm.hte.eu.ngram.interfaces.Extractor;
import sec.bdc.tm.hte.eu.ngram.preprocessing.SourcePreprocessor;
import sec.bdc.tm.hte.eu.ngram.preprocessing.TwitterPreprocessor;
import sec.bdc.tm.hte.eu.ngram.settings.NgramExtractorSettings;
import sec.bdc.tm.hte.eu.ngram.settings.PropertyConstantsEnums;
import sec.bdc.tm.hte.eu.ngram.settings.general.PropertyConstantsEnums;
import sec.bdc.tm.hte.eu.ngram.structures.Document;
import sec.bdc.tm.hte.eu.ngram.structures.Keyphrase;
import sec.bdc.tm.hte.eu.preprocessing.bnlp.BasicNLPAnalyzerEu;
import sec.bdc.tm.hte.eu.preprocessing.resource.ResourceObject;

/* JADX INFO: Access modifiers changed from: package-private */
/* loaded from: classes49.dex */
/**
 * Extracts key phrases from a stream of documents.
 *
 * <p>Each instance wires together three collaborators chosen from the supplied
 * {@link NgramExtractorSettings}: a {@link SourcePreprocessor} that breaks raw text into pieces,
 * an {@link Extractor} that turns those pieces into {@link Keyphrase}s, and a
 * {@link PhraseClusterer} that is run over the extracted phrases.
 *
 * <p>Only the {@code NGRAM} extractor type and the {@code TWITTER} source type are accepted;
 * any other value (including {@code null}) is rejected with an
 * {@link IllegalArgumentException} at construction time.
 */
public class CorpusPhraseExtractor {
    private static final Logger LOG = LoggerFactory.getLogger(CorpusPhraseExtractor.class);

    private final Extractor extractor;
    private final PhraseClusterer phraseClusterer;
    private final SourcePreprocessor preprocessor;

    /**
     * Creates an extractor whose n-gram finder is configured from the settings' input language.
     *
     * @param ngramExtractorSettings settings supplying the extractor type, n-gram parameters,
     *     input language, tokenizer type, source type and clustering parameters
     * @param basicNLPAnalyzerEu analyzer handed to the n-gram finder
     * @throws NgramException declared by the extractor factory
     * @throws IllegalArgumentException if the configured extractor type or source type is not
     *     supported
     */
    public CorpusPhraseExtractor(NgramExtractorSettings ngramExtractorSettings, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        // Initialization order (extractor, then preprocessor) decides which validation
        // failure is reported first and the order of the log lines; keep it stable.
        this.extractor = loadExtractor(
                ngramExtractorSettings.getExtractorType(),
                ngramExtractorSettings.getNgramMaxSize(),
                ngramExtractorSettings.getNgramMinCount(),
                ngramExtractorSettings.getNgramSubgramFactor(),
                ngramExtractorSettings.getInputLanguage(),
                ngramExtractorSettings.getNgramTokenizerType(),
                basicNLPAnalyzerEu);
        this.preprocessor = loadSourcePreprocessor(ngramExtractorSettings.getInputSourceType());
        this.phraseClusterer = newPhraseClusterer(ngramExtractorSettings);
    }

    /**
     * Creates an extractor whose n-gram finder is configured from a {@link ResourceObject}
     * instead of the settings' input language.
     *
     * @param ngramExtractorSettings settings supplying the extractor type, n-gram parameters,
     *     tokenizer type, source type and clustering parameters
     * @param basicNLPAnalyzerEu analyzer handed to the n-gram finder
     * @param resourceObject resource object handed to the n-gram finder
     * @throws NgramException declared by the extractor factory
     * @throws IllegalArgumentException if the configured extractor type or source type is not
     *     supported
     */
    public CorpusPhraseExtractor(NgramExtractorSettings ngramExtractorSettings, BasicNLPAnalyzerEu basicNLPAnalyzerEu, ResourceObject resourceObject) throws NgramException {
        // Unlike the two-argument constructor, the preprocessor is resolved before the
        // extractor here; the order is preserved deliberately.
        this.preprocessor = loadSourcePreprocessor(ngramExtractorSettings.getInputSourceType());
        this.extractor = loadExtractor(
                ngramExtractorSettings.getExtractorType(),
                ngramExtractorSettings.getNgramMaxSize(),
                ngramExtractorSettings.getNgramMinCount(),
                ngramExtractorSettings.getNgramSubgramFactor(),
                resourceObject,
                ngramExtractorSettings.getNgramTokenizerType(),
                basicNLPAnalyzerEu);
        this.phraseClusterer = newPhraseClusterer(ngramExtractorSettings);
    }

    /** Builds the phrase clusterer from the language and clustering values in the settings. */
    private static PhraseClusterer newPhraseClusterer(NgramExtractorSettings settings) {
        return new PhraseClusterer(
                settings.getInputLanguage(),
                settings.getClusteringMinWordsLongPhrase(),
                settings.getClusteringJaccardCoefficient());
    }

    /**
     * Rejects every extractor type except {@code NGRAM} and logs the selection.
     *
     * @throws IllegalArgumentException if {@code extractorType} is not {@code NGRAM}
     *     (a {@code null} type is rejected the same way)
     */
    private static void requireNgramExtractorType(PropertyConstantsEnums.ExtractorType extractorType) {
        if (!PropertyConstantsEnums.ExtractorType.NGRAM.equals(extractorType)) {
            throw new IllegalArgumentException("Unsupported extractor type: " + extractorType);
        }
        LOG.info("Using Ngram-based extractor.");
    }

    /**
     * Creates the language-configured n-gram extractor.
     *
     * @param extractorType requested extractor type; must be {@code NGRAM}
     * @param maxSize value of the settings' n-gram max size
     * @param minCount value of the settings' n-gram min count
     * @param subgramFactor value of the settings' n-gram subgram factor
     * @param language input language passed to the finder
     * @param tokenizerType tokenizer type passed to the finder
     * @param basicNLPAnalyzerEu analyzer passed to the finder
     * @return a new {@link NgramFinder}
     * @throws IllegalArgumentException if {@code extractorType} is not {@code NGRAM}
     */
    private static Extractor loadExtractor(PropertyConstantsEnums.ExtractorType extractorType, int maxSize, int minCount, double subgramFactor, Language language, PropertyConstantsEnums.TokenizerType tokenizerType, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        requireNgramExtractorType(extractorType);
        return new NgramFinder(maxSize, minCount, subgramFactor, language, tokenizerType, basicNLPAnalyzerEu);
    }

    /**
     * Creates the resource-configured n-gram extractor.
     *
     * @param extractorType requested extractor type; must be {@code NGRAM}
     * @param maxSize value of the settings' n-gram max size
     * @param minCount value of the settings' n-gram min count
     * @param subgramFactor value of the settings' n-gram subgram factor
     * @param resourceObject resource object passed to the finder
     * @param tokenizerType tokenizer type passed to the finder
     * @param basicNLPAnalyzerEu analyzer passed to the finder
     * @return a new {@link NgramFinder}
     * @throws IllegalArgumentException if {@code extractorType} is not {@code NGRAM}
     */
    private static Extractor loadExtractor(PropertyConstantsEnums.ExtractorType extractorType, int maxSize, int minCount, double subgramFactor, ResourceObject resourceObject, PropertyConstantsEnums.TokenizerType tokenizerType, BasicNLPAnalyzerEu basicNLPAnalyzerEu) throws NgramException {
        requireNgramExtractorType(extractorType);
        return new NgramFinder(maxSize, minCount, subgramFactor, resourceObject, tokenizerType, basicNLPAnalyzerEu);
    }

    /**
     * Creates the preprocessor for the given source type.
     *
     * @param sourceType requested source type; must be {@code TWITTER}
     * @return a new {@link TwitterPreprocessor}
     * @throws IllegalArgumentException if {@code sourceType} is not {@code TWITTER}
     *     (a {@code null} type is rejected the same way)
     */
    private static SourcePreprocessor loadSourcePreprocessor(PropertyConstantsEnums.SourceType sourceType) {
        if (!PropertyConstantsEnums.SourceType.TWITTER.equals(sourceType)) {
            throw new IllegalArgumentException("Unsupported source type: " + sourceType);
        }
        LOG.info("Using Twitter-specific preprocessor.");
        return new TwitterPreprocessor();
    }

    /**
     * Extracts key phrases from the given documents.
     *
     * <p>Each document is first mapped through the desugared lambda
     * {@code CorpusPhraseExtractor$$Lambda$0} (its body is not part of this file; its results
     * are treated as {@code String}s), every resulting string is expanded into the pieces
     * returned by {@link SourcePreprocessor#preprocessBreaking}, and the flattened stream is
     * handed to the extractor. The resulting set is then passed to the phrase clusterer and
     * that same set instance is returned.
     *
     * @param stream documents to process
     * @return the set produced by the extractor, after it has been passed to the clusterer
     * @throws NgramException declared by the extraction/clustering steps
     */
    public Set<Keyphrase> extractPhrases(Stream<Document> stream) throws NgramException {
        // The explicit (String) cast mirrors the decompiled adapter: the first mapping stage is
        // an opaque synthetic Function, so the element type may only be known as Object here.
        Set<Keyphrase> phrases = this.extractor.extractPhrases(
                stream.map(CorpusPhraseExtractor$$Lambda$0.$instance)
                        .flatMap(text -> this.preprocessor.preprocessBreaking((String) text).stream()));
        this.phraseClusterer.clusterPhrases(phrases);
        return phrases;
    }

    /**
     * Synthetic lambda body: expands one string into a stream of its preprocessed pieces.
     *
     * <p>Retained with its original signature because the desugared class
     * {@code CorpusPhraseExtractor$$Lambda$1} invokes it by name.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public final /* synthetic */ Stream lambda$extractPhrases$0$CorpusPhraseExtractor(String str) {
        return this.preprocessor.preprocessBreaking(str).stream();
    }
}
