package sec.bdc.tm.hte.eu.ngram.clustering;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sec.bdc.nlp.Language;
import sec.bdc.tm.hte.eu.ngram.structures.Keyphrase;
import sec.bdc.tm.hte.eu.ngram.utils.ValueComparator;

/* loaded from: classes49.dex */
/**
 * Clusters phrase texts that look like variants of one another and merges the
 * {@link Keyphrase} objects of every cluster into a single representative.
 *
 * <p>{@link #clusterPhraseTexts(Set)} normalizes the input with a {@code StringNormalizer} and
 * then unions phrases found by five detectors: concatenation, subsequence (suffix), acronym,
 * word-ordering and long-phrase word overlap (Jaccard coefficient).
 *
 * <p>NOTE(review): this file is decompiled output. Several lambdas live in generated
 * {@code PhraseClusterer$$Lambda$N} classes whose bodies are not visible here; comments about
 * what they do are marked as assumptions. The {@code lambda$...} methods below must keep their
 * names and signatures because those generated classes may call them.
 */
public class PhraseClusterer {
    private static final Logger LOG = LoggerFactory.getLogger(PhraseClusterer.class);
    private final Language language;
    private final double minJaccardCoefficient;
    private final int minWordsLongPhrase;

    /**
     * Creates a clusterer.
     *
     * @param language              language handed to the {@code StringNormalizer}
     * @param minWordsLongPhrase    minimum number of space-separated words for a phrase to take
     *                              part in the word-overlap detector
     * @param minJaccardCoefficient minimum Jaccard coefficient (inclusive) of the word sets of
     *                              two long phrases for them to be put into the same cluster
     */
    public PhraseClusterer(Language language, int minWordsLongPhrase, double minJaccardCoefficient) {
        this.language = language;
        this.minWordsLongPhrase = minWordsLongPhrase;
        this.minJaccardCoefficient = minJaccardCoefficient;
    }

    /**
     * Finds multi-word phrases whose word initials spell another phrase of {@code set}.
     *
     * <p>One-character words contribute no initial. Acronyms that do not have exactly one
     * expansion are dropped before the clusters are built.
     *
     * @param set phrases to inspect
     * @return clusters produced by the generated mapper {@code PhraseClusterer$$Lambda$9}
     *         (presumably the acronym together with its expansion - TODO confirm)
     */
    // Unchecked: the generated lambda instances are untyped, so the stream result is erased.
    @SuppressWarnings("unchecked")
    private static Set<Set<String>> findAcronymSynonyms(Set<String> set) {
        Map<String, Set<String>> expansionsByAcronym = new HashMap<>();
        for (String phrase : set) {
            String[] words = phrase.split(StringUtils.SPACE);
            if (words.length < 2) {
                continue;
            }
            StringBuilder initials = new StringBuilder();
            for (String word : words) {
                if (word.length() > 1) {
                    initials.append(word.charAt(0));
                }
            }
            String acronym = initials.toString();
            if (set.contains(acronym)) {
                expansionsByAcronym.computeIfAbsent(acronym, key -> new HashSet<>()).add(phrase);
            }
        }
        // Presumably backed by lambda$findAcronymSynonyms$3 (drops entries whose size != 1).
        expansionsByAcronym.entrySet().removeIf(PhraseClusterer$$Lambda$8.$instance);
        return (Set<Set<String>>) expansionsByAcronym.entrySet().stream()
                .map(PhraseClusterer$$Lambda$9.$instance)
                .collect(Collectors.toSet());
    }

    /**
     * Groups phrases that become identical once all spaces are removed.
     *
     * @param set phrases to inspect
     * @return one set per distinct space-free form (singleton sets included)
     */
    private static Set<Set<String>> findConcatenationSynonyms(Set<String> set) {
        Map<String, Set<String>> phrasesByJoinedForm = new HashMap<>();
        for (String phrase : set) {
            // Literal replacement; the separator is a plain space so no regex is needed.
            String joined = phrase.replace(StringUtils.SPACE, "");
            phrasesByJoinedForm.computeIfAbsent(joined, key -> new HashSet<>()).add(phrase);
        }
        return new HashSet<>(phrasesByJoinedForm.values());
    }

    /**
     * Clusters long phrases whose word sets overlap sufficiently.
     *
     * <p>Only phrases with at least {@code minWordsLongPhrase} words are considered; every
     * unordered pair of them is compared via
     * {@link #unionIfIntersectSufficiently(FindAndUnion, String, String)}.
     *
     * @param set phrases to inspect
     * @return the resulting partition of the long phrases
     */
    private Set<Set<String>> findLongSubsetSynonyms(Set<String> set) {
        Set<String> longPhrases = set.stream()
                .filter(this::isLongPhrase)
                .collect(Collectors.toSet());
        FindAndUnion<String> clusters = new FindAndUnion<>(longPhrases);
        List<String> ordered = new ArrayList<>(longPhrases);
        for (int first = 0; first < ordered.size(); first++) {
            for (int second = first + 1; second < ordered.size(); second++) {
                unionIfIntersectSufficiently(clusters, ordered.get(first), ordered.get(second));
            }
        }
        return clusters.asSetOfSets();
    }

    /**
     * Groups phrases made of more than one distinct word using the generated classifier
     * {@code PhraseClusterer$$Lambda$6} and mapper {@code PhraseClusterer$$Lambda$7}.
     *
     * <p>NOTE(review): presumably phrases are grouped by their word set and mapped back to the
     * phrase text, so that re-orderings of the same words end up together - confirm against the
     * generated classes.
     *
     * @param set phrases to inspect
     * @return the groups built by the collector
     */
    // Unchecked: the generated lambda instances are untyped, so the collector result is erased.
    @SuppressWarnings("unchecked")
    private static Set<Set<String>> findOrderingSynonyms(Set<String> set) {
        Map<String, Set<String>> wordsByPhrase = new HashMap<>();
        for (String phrase : set) {
            Set<String> words = Sets.newHashSet(phrase.split(StringUtils.SPACE));
            if (words.size() > 1) {
                wordsByPhrase.put(phrase, words);
            }
        }
        Map<Object, Set<String>> grouped = (Map<Object, Set<String>>) wordsByPhrase.entrySet().stream()
                .collect(Collectors.groupingBy(
                        PhraseClusterer$$Lambda$6.$instance,
                        Collectors.mapping(PhraseClusterer$$Lambda$7.$instance, Collectors.toSet())));
        return new HashSet<>(grouped.values());
    }

    /**
     * Finds phrases that, with their first word removed, equal another phrase of {@code set}.
     *
     * <p>Shorter phrases that do not have exactly one such extension are dropped before the
     * clusters are built.
     *
     * @param set phrases to inspect
     * @return clusters produced by the generated mapper {@code PhraseClusterer$$Lambda$12}
     *         (presumably the short phrase together with its extension - TODO confirm)
     */
    // Unchecked: the generated lambda instances are untyped, so the stream result is erased.
    @SuppressWarnings("unchecked")
    private static Set<Set<String>> findSubsequenceSynonyms(Set<String> set) {
        Map<String, Set<String>> extensionsBySuffix = new HashMap<>();
        for (String phrase : set) {
            List<String> words = Arrays.asList(phrase.split(StringUtils.SPACE));
            if (words.size() < 2) {
                continue;
            }
            String suffix = String.join(StringUtils.SPACE, words.subList(1, words.size()));
            if (set.contains(suffix)) {
                extensionsBySuffix.computeIfAbsent(suffix, key -> new HashSet<>()).add(phrase);
            }
        }
        // Presumably backed by lambda$findSubsequenceSynonyms$6 (drops entries whose size != 1).
        extensionsBySuffix.entrySet().removeIf(PhraseClusterer$$Lambda$11.$instance);
        return (Set<Set<String>>) extensionsBySuffix.entrySet().stream()
                .map(PhraseClusterer$$Lambda$12.$instance)
                .collect(Collectors.toSet());
    }

    /**
     * Picks the representative text of a cluster.
     *
     * <p>Builds a map from the cluster members to values derived from {@code map} and returns
     * the first key reported by {@code ValueComparator.sortedKeys(..., false)}.
     * NOTE(review): the value function is the generated {@code PhraseClusterer$$Lambda$5};
     * presumably it looks each member up in {@code map} - confirm.
     *
     * @param set members of the cluster
     * @param map per-phrase values used for ranking; must not be {@code null}
     * @return the text of the best-ranked member
     */
    // Raw/unchecked: the generated lambda instances and ValueComparator are used untyped.
    @SuppressWarnings({"rawtypes", "unchecked"})
    private String getBestClusterRepresentative(Set<String> set, Map<String, Integer> map) {
        Stream<String> members = set.stream();
        Function keyMapper = PhraseClusterer$$Lambda$4.$instance;
        Objects.requireNonNull(map);
        Map valuesByMember = (Map) members.collect(
                Collectors.toMap(keyMapper, PhraseClusterer$$Lambda$5.get$Lambda(map)));
        return (String) ValueComparator.sortedKeys(valuesByMember, false).first();
    }

    /**
     * Synthetic lambda body: returns its argument unchanged. Kept with its generated name
     * because generated lambda classes may call it.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public static final /* synthetic */ Keyphrase lambda$clusterPhrases$0$PhraseClusterer(Keyphrase keyphrase) {
        return keyphrase;
    }

    /**
     * Synthetic lambda body: true when the entry's value set does not hold exactly one element.
     * Kept with its generated name and raw signature because generated lambda classes may call it.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public static final /* synthetic */ boolean lambda$findAcronymSynonyms$3$PhraseClusterer(Map.Entry entry) {
        return ((Set) entry.getValue()).size() != 1;
    }

    /**
     * Synthetic lambda body: true when the entry's value set does not hold exactly one element.
     * Kept with its generated name and raw signature because generated lambda classes may call it.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public static final /* synthetic */ boolean lambda$findSubsequenceSynonyms$6$PhraseClusterer(Map.Entry entry) {
        return ((Set) entry.getValue()).size() != 1;
    }

    /**
     * Synthetic lambda body: returns its argument unchanged. Kept with its generated name
     * because generated lambda classes may call it.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public static final /* synthetic */ String lambda$getBestClusterRepresentative$2$PhraseClusterer(String str) {
        return str;
    }

    /**
     * Merges every non-representative keyphrase of each cluster into the cluster's
     * representative and removes it from {@code set}.
     *
     * @param set  keyphrase collection to shrink in place
     * @param map  keyphrases indexed by phrase text
     * @param set2 clusters of phrase texts
     */
    // Unchecked: the generated lambda instances are untyped, so the collector result is erased.
    @SuppressWarnings("unchecked")
    private void mergePhrases(Set<Keyphrase> set, Map<String, Keyphrase> map, Set<Set<String>> set2) {
        // NOTE(review): which Integer the generated Lambda$3 extracts from each entry is not
        // visible here; it is only used to rank cluster members.
        Map<String, Integer> rankingValues = (Map<String, Integer>) map.entrySet().stream()
                .collect(Collectors.toMap(PhraseClusterer$$Lambda$2.$instance, PhraseClusterer$$Lambda$3.$instance));
        for (Set<String> cluster : set2) {
            String representativeText = getBestClusterRepresentative(cluster, rankingValues);
            Keyphrase representative = map.get(representativeText);
            for (String memberText : cluster) {
                if (memberText.equals(representativeText)) {
                    continue;
                }
                Keyphrase member = map.get(memberText);
                representative.addSynonym(member);
                set.remove(member);
            }
        }
    }

    /**
     * Unions all members of every given group inside {@code findAndUnion}.
     *
     * @param findAndUnion structure to update
     * @param set          groups whose members must end up together; empty groups are skipped
     */
    private static void union(FindAndUnion<String> findAndUnion, Set<Set<String>> set) {
        for (Set<String> group : set) {
            if (group.isEmpty()) {
                continue;
            }
            // The anchor is looked up once per group, before any union of that group.
            String anchor = findAndUnion.find(group.iterator().next());
            for (String member : group) {
                findAndUnion.union(anchor, findAndUnion.find(member));
            }
        }
    }

    /**
     * Unions two phrases when the Jaccard coefficient of their word sets
     * (|intersection| / |union|) is at least {@code minJaccardCoefficient}.
     *
     * @param findAndUnion structure to update
     * @param str          first phrase
     * @param str2         second phrase
     */
    private void unionIfIntersectSufficiently(FindAndUnion<String> findAndUnion, String str, String str2) {
        Set<String> firstWords = Sets.newHashSet(str.split(StringUtils.SPACE));
        Set<String> secondWords = Sets.newHashSet(str2.split(StringUtils.SPACE));
        double sharedWords = Sets.intersection(firstWords, secondWords).size();
        double allWords = Sets.union(firstWords, secondWords).size();
        if (sharedWords / allWords >= this.minJaccardCoefficient) {
            findAndUnion.union(findAndUnion.find(str), findAndUnion.find(str2));
        }
    }

    /**
     * Clusters the given phrase texts.
     *
     * <p>The texts are normalized, the five detectors are applied to the normalized strings in
     * a fixed order, and the resulting clusters are mapped back through the normalizer.
     *
     * @param set phrase texts to cluster
     * @return clusters as returned by {@code StringNormalizer.unnormalizeClusters}
     */
    public Set<Set<String>> clusterPhraseTexts(Set<String> set) {
        StringNormalizer stringNormalizer = new StringNormalizer(set, this.language);
        Set<String> normalizedStrings = stringNormalizer.getNormalizedStrings();
        FindAndUnion<String> clusters = new FindAndUnion<>(normalizedStrings);
        union(clusters, findConcatenationSynonyms(normalizedStrings));
        union(clusters, findSubsequenceSynonyms(normalizedStrings));
        union(clusters, findAcronymSynonyms(normalizedStrings));
        union(clusters, findOrderingSynonyms(normalizedStrings));
        union(clusters, findLongSubsetSynonyms(normalizedStrings));
        return stringNormalizer.unnormalizeClusters(clusters.asSetOfSets());
    }

    /**
     * Clusters the given keyphrases and merges each cluster into one representative, removing
     * the merged keyphrases from {@code set} in place.
     *
     * @param set keyphrases to cluster; modified by this call
     */
    // Unchecked: the generated lambda instances are untyped, so the collector result is erased.
    @SuppressWarnings("unchecked")
    public void clusterPhrases(Set<Keyphrase> set) {
        LOG.info("Before clustering: {} phrases.", set.size());
        // NOTE(review): the key function is the generated Lambda$0; presumably it yields the
        // phrase text of each keyphrase - confirm.
        Map<String, Keyphrase> keyphrasesByText = (Map<String, Keyphrase>) set.stream()
                .collect(Collectors.toMap(PhraseClusterer$$Lambda$0.$instance, PhraseClusterer$$Lambda$1.$instance));
        Set<Set<String>> clusters = clusterPhraseTexts(keyphrasesByText.keySet());
        LOG.info("{} clusters created.", clusters.size());
        mergePhrases(set, keyphrasesByText, clusters);
    }

    /** Returns whether the phrase has at least {@code minWordsLongPhrase} space-separated words. */
    private boolean isLongPhrase(String phrase) {
        return phrase.split(StringUtils.SPACE).length >= this.minWordsLongPhrase;
    }

    /**
     * Synthetic lambda body for the long-phrase filter; delegates to {@link #isLongPhrase(String)}.
     * Kept with its generated name because the generated {@code PhraseClusterer$$Lambda$10}
     * class calls it.
     */
    /* JADX INFO: Access modifiers changed from: package-private */
    public final /* synthetic */ boolean lambda$findLongSubsetSynonyms$5$PhraseClusterer(String str) {
        return isLongPhrase(str);
    }
}
