package cn.edu.hfut.dmic.htmlbot.contentextractor;

import cn.edu.hfut.dmic.htmlbot.DomPage;
import cn.edu.hfut.dmic.htmlbot.HtmlBot;
import cn.edu.hfut.dmic.htmlbot.util.GaussSmooth;
import cn.edu.hfut.dmic.htmlbot.util.JsoupHelper;
import cn.edu.hfut.dmic.htmlbot.util.TextUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

/* loaded from: classes.dex */
/**
 * Extracts the main textual content of an HTML page by scoring its text nodes.
 *
 * <p>Construction removes every {@code script}, {@code style} and {@code iframe} element from the
 * page's document (the document is modified in place) and then collects every non-blank text
 * node together with the path returned by {@code JsoupHelper.getXpath}. Text nodes that share a
 * path are scored as one group (see {@link ComputeInfo}). The per-node score sequence is smoothed
 * with {@code GaussSmooth.gaussSmooth}, and {@link #getContent()} keeps the nodes whose smoothed
 * score is strictly greater than 0.8 times the standard deviation of the smoothed scores.
 */
public class ContentExtractor {
    private final Document doc;
    private final DomPage domPage;
    private double threshold;

    /** Non-blank text nodes, in the order the traversal visited them. */
    private final ArrayList<TextNode> tNodeList = new ArrayList<>();

    /**
     * Path of each collected text node; entry {@code i} belongs to {@code tNodeList.get(i)}.
     *
     * <p>A position-aligned list is used instead of a map keyed by the node so that the
     * association does not depend on how jsoup implements {@code equals}/{@code hashCode} for
     * nodes. NOTE(review): some jsoup releases are believed to compare nodes by content, which
     * would make two text nodes with the same text collide in a hash map - confirm against the
     * jsoup version on the classpath.
     */
    private final ArrayList<String> xpathList = new ArrayList<>();

    /** Per-node counts, grouped by path. */
    private final HashMap<String, ArrayList<CountInfo>> countMap = new HashMap<>();

    /** Group statistics, one entry per path present in {@link #countMap}. */
    private final HashMap<String, ComputeInfo> computeMap = new HashMap<>();

    /** Raw score of each collected text node (the {@code etpr} of its path group). */
    private final ArrayList<Double> etprList = new ArrayList<>();

    /** Smoothed version of {@link #etprList}; stays empty when no text node was collected. */
    private ArrayList<Double> gaussEtprList = new ArrayList<>();

    /**
     * Statistics of a group of text nodes. As filled in by this class: {@code tpr} and
     * {@code ppr} are the mean text and punctuation counts of the group, {@code cs} and
     * {@code ps} are the standard deviations of those counts, and {@code etpr} is the product
     * of all four values.
     */
    public static class ComputeInfo {
        double cs;
        double etpr;
        double ppr;
        double ps;
        double tpr;

        /**
         * @param tpr first factor (mean text count when created by {@code ContentExtractor})
         * @param ppr second factor (mean punctuation count)
         * @param cs third factor (deviation of the text counts)
         * @param ps fourth factor (deviation of the punctuation counts)
         */
        public ComputeInfo(double tpr, double ppr, double cs, double ps) {
            this.tpr = tpr;
            this.ppr = ppr;
            this.cs = cs;
            this.ps = ps;
            // Multiplication order is kept as tpr * ppr * cs * ps so results stay bit-identical.
            this.etpr = tpr * ppr * cs * ps;
        }
    }

    /**
     * Counts for a single text node, as reported by {@code TextUtils.countText} and
     * {@code TextUtils.countPunc} for the node's text.
     */
    public static class CountInfo {
        public int puncCount;
        TextNode tNode;
        public int textCount;

        /**
         * @param textNode the node whose text is counted; it is also retained in {@code tNode}
         */
        public CountInfo(TextNode textNode) {
            this.tNode = textNode;
            String text = textNode.text();
            this.textCount = TextUtils.countText(text);
            this.puncCount = TextUtils.countPunc(text);
        }
    }

    /**
     * Prepares the extractor for the given page: cleans the page's document and computes the
     * node scores and the threshold used by {@link #getContent()}.
     *
     * @param domPage page whose document is analysed; the document is modified in place
     */
    public ContentExtractor(DomPage domPage) {
        this.domPage = domPage;
        this.doc = domPage.getDoc();
        clean();
        buildHisto();
    }

    /**
     * Records one text node and its counts under the node's path. Nodes whose trimmed text is
     * empty are ignored.
     *
     * <p>This method is invoked for every text node while the extractor is being constructed.
     * Calling it afterwards does not refresh the scores or the threshold, so the new node has no
     * smoothed score for {@link #getContent()} to read.
     *
     * @param textNode the text node to record
     */
    public void addTextNode(TextNode textNode) {
        if (textNode.text().trim().isEmpty()) {
            return;
        }
        String xpath = JsoupHelper.getXpath(textNode);
        // Both lists are appended together so that index i always describes the same node.
        this.tNodeList.add(textNode);
        this.xpathList.add(xpath);

        ArrayList<CountInfo> group = this.countMap.get(xpath);
        if (group == null) {
            group = new ArrayList<>();
            this.countMap.put(xpath, group);
        }
        group.add(new CountInfo(textNode));
    }

    /**
     * Collects the text nodes, computes one {@link ComputeInfo} per path, derives the raw and
     * smoothed per-node scores and finally the selection threshold.
     */
    private void buildHisto() {
        this.doc.traverse(new NodeVisitor() {
            @Override
            public void head(Node node, int depth) {
                if (node instanceof TextNode) {
                    ContentExtractor.this.addTextNode((TextNode) node);
                }
            }

            @Override
            public void tail(Node node, int depth) {
                // Nothing to do when leaving a node.
            }
        });

        for (Map.Entry<String, ArrayList<CountInfo>> entry : this.countMap.entrySet()) {
            this.computeMap.put(entry.getKey(), getComputeInfo(entry.getValue()));
        }

        // Every path in xpathList was registered in countMap by addTextNode, so the lookup
        // always finds the statistics of the node's group.
        for (String xpath : this.xpathList) {
            this.etprList.add(this.computeMap.get(xpath).etpr);
        }

        // With no text nodes there is nothing to smooth: keep the empty list rather than rely
        // on how the smoothing helper treats empty input. The threshold then evaluates to 0.
        if (!this.etprList.isEmpty()) {
            this.gaussEtprList = GaussSmooth.gaussSmooth(this.etprList, 1);
        }
        this.threshold = computeThreshold();
    }

    /** Removes all {@code script}, {@code style} and {@code iframe} elements from the document. */
    private void clean() {
        this.doc.select("script").remove();
        this.doc.select("style").remove();
        this.doc.select("iframe").remove();
    }

    /**
     * Computes the population standard deviation (divisor {@code n}) of the given values.
     *
     * @param values the values to measure
     * @return the standard deviation, or {@code 0.0} for an empty list
     */
    private double computeDeviation(ArrayList<Double> values) {
        if (values.isEmpty()) {
            return 0.0d;
        }
        double sum = 0.0d;
        for (double value : values) {
            sum += value;
        }
        double mean = sum / values.size();

        double squaredDiffSum = 0.0d;
        for (double value : values) {
            squaredDiffSum += (value - mean) * (value - mean);
        }
        return Math.sqrt(squaredDiffSum / values.size());
    }

    /**
     * @return 0.8 times the standard deviation of the smoothed scores
     */
    private double computeThreshold() {
        return 0.8d * computeDeviation(this.gaussEtprList);
    }

    /**
     * Builds the statistics of one path group: mean and standard deviation of the text counts
     * and of the punctuation counts. A group with a single node has zero deviations and
     * therefore an {@code etpr} of zero.
     *
     * @param group counts of all text nodes sharing one path; never empty when called from
     *     {@link #buildHisto()} because a group is only created when its first node is added
     * @return the statistics of the group
     */
    private ComputeInfo getComputeInfo(ArrayList<CountInfo> group) {
        double textSum = 0.0d;
        double puncSum = 0.0d;
        ArrayList<Double> textCounts = new ArrayList<>();
        ArrayList<Double> puncCounts = new ArrayList<>();
        for (CountInfo info : group) {
            textSum += info.textCount;
            puncSum += info.puncCount;
            textCounts.add((double) info.textCount);
            puncCounts.add((double) info.puncCount);
        }
        int size = group.size();
        return new ComputeInfo(
                textSum / size,
                puncSum / size,
                computeDeviation(textCounts),
                computeDeviation(puncCounts));
    }

    /**
     * Extracts the content of a page given as HTML source.
     *
     * @param str the HTML, passed to {@code HtmlBot.getDomPageByHtml}
     * @return the extracted text, one selected text node per line
     * @throws Exception if the page cannot be built
     */
    public static String getContentByHtml(String str) throws Exception {
        return new ContentExtractor(HtmlBot.getDomPageByHtml(str)).getContent();
    }

    /**
     * Extracts the content of the page at the given URL.
     *
     * @param str the URL, passed to {@code HtmlBot.getDomPageByURL}
     * @return the extracted text, one selected text node per line
     * @throws Exception if the page cannot be built
     */
    public static String getContentByURL(String str) throws Exception {
        return new ContentExtractor(HtmlBot.getDomPageByURL(str)).getContent();
    }

    /**
     * Demo entry point: prints the content extracted from a fixed news URL.
     *
     * @param strArr ignored
     * @throws Exception if the extraction fails
     */
    public static void main(String[] strArr) throws Exception {
        System.out.println(getContentByURL("http://news.xinhuanet.com/world/2014-11/02/c_127166728.htm"));
    }

    /**
     * Returns the trimmed text of every collected text node whose smoothed score is strictly
     * greater than the threshold, each followed by a newline, in collection order.
     *
     * @return the extracted content; empty when no node qualifies
     */
    public String getContent() {
        StringBuilder content = new StringBuilder();
        for (int i = 0; i < this.tNodeList.size(); i++) {
            if (this.gaussEtprList.get(i) > this.threshold) {
                // Two appends avoid building a temporary concatenated String per node.
                content.append(this.tNodeList.get(i).text().trim()).append("\n");
            }
        }
        return content.toString();
    }
}
