package TxtParserPackage.extractors.HTML_DataGrabber;

import NetworkPackage.HttpServer.HttpSimpleClient;
import TxtParserPackage.AutomaticTextParser;
import java.lang.reflect.Array;
import java.util.Vector;

/* loaded from: classes.dex */
/**
 * Extracts structured text from HTML pages using plain string searches.
 *
 * <p>Two extraction styles are offered:
 * <ul>
 *   <li>a recursive "section crawl" ({@link #extractFromHtml(String, boolean, String)}) that
 *       downloads a page, walks its {@code <h2>} / {@code <h3>} sections and appends one row per
 *       list item or paragraph to {@link #result};</li>
 *   <li>a table extraction ({@link #extractFromHtml(String, String, String, String, String, String, int)}
 *       and {@link #extractTableFromHtml(String, String, String)}) that turns a region of an
 *       already-downloaded page into a two-dimensional array of cell texts.</li>
 * </ul>
 *
 * <p>The instance keeps mutable crawl state in public fields and performs no synchronisation.
 */
public class HTMLDataGrabber {
    /**
     * Enables diagnostic output on standard output. When set, the section crawl also stops
     * downloading new pages once more than 10 URLs have been recorded in {@link #accessedURL}.
     */
    public static boolean DEBUG = false;

    /** Copy of {@link HttpSimpleClient#HTML_ERROR}, taken when this class is initialised. */
    public static String HTML_ERROR = HttpSimpleClient.HTML_ERROR;

    /** URLs the section crawl has already handled; a URL found here is never processed again. */
    public Vector accessedURL = new Vector();

    /** Current nesting depth of the section crawl (0 when no crawl is running). */
    public int treeDeep = 0;

    /** The section crawl ignores any page requested while {@link #treeDeep} has reached this value. */
    public int treeDeepMax = 3;

    /** Titles that must be skipped; consulted through {@code AutomaticTextParser.isInVector}. */
    public Vector dropItemList = new Vector();

    /**
     * Rows collected by the section crawl. Every element is a {@code String[6]} holding, in order:
     * the label passed to the crawl, the H2 title, the H3 title (or the H2 title when there is no
     * H3), the item title or paragraph summary, the item URL (empty when there is none) and the
     * full item text.
     */
    public Vector result = new Vector();

    /**
     * Collects the content of one section: its list items when the fragment contains at least one
     * {@code <li>}, otherwise its paragraphs.
     *
     * @param fragment    HTML of the section
     * @param baseUrl     prefix prepended to every {@code href} value found in a list item
     * @param followLinks whether linked pages are crawled recursively
     * @param indent      indentation prefix used for the console trace
     * @param rootLabel   value stored in the first column of every produced row
     * @param h2Title     title of the enclosing H2 section
     * @param h3Title     title of the enclosing H3 section; an empty string means "use the H2 title"
     */
    private void extractParagraphContent(String fragment, String baseUrl, boolean followLinks, String indent,
            String rootLabel, String h2Title, String h3Title) {
        String[] listItems = AutomaticTextParser.splitAlsoBlankToken(fragment, "<li>");
        if (h3Title.equals("")) {
            h3Title = h2Title;
        }
        if (listItems.length > 1) {
            collectListItems(listItems, baseUrl, followLinks, indent, rootLabel, h2Title, h3Title);
        } else {
            collectParagraphs(fragment, indent, rootLabel, h2Title, h3Title);
        }
    }

    /**
     * Adds one row per accepted list item. Element 0 of {@code listItems} is the text that precedes
     * the first {@code <li>} and is skipped.
     */
    private void collectListItems(String[] listItems, String baseUrl, boolean followLinks, String indent,
            String rootLabel, String h2Title, String h3Title) {
        for (int i = 1; i < listItems.length; i++) {
            String item = listItems[i];
            boolean hasTitledLink = item.indexOf("<a ") != -1
                    && item.indexOf("title=\"") != -1
                    && item.indexOf("href=\"") != -1;
            if (hasTitledLink) {
                String title = getTitleName(item, "li");
                String url = baseUrl + AutomaticTextParser.getInnerFirstOccurence(item, "href=\"", "\"");
                if (includeThisItemName(title)) {
                    this.result.add(new String[]{rootLabel, h2Title, h3Title, title, url, title});
                    System.out.println("\t\t" + indent + "[*] ." + this.treeDeep + " " + title + " (" + url + ")");
                    if (followLinks) {
                        // Linked pages are always crawled with link-following enabled; the depth
                        // limit and the visited list are what stop the recursion.
                        extractFromHtml(url, true, title);
                    }
                }
            } else if (item.indexOf("</li>") != -1) {
                String title = getTitleName(item, "li");
                if (includeThisItemName(title)) {
                    this.result.add(new String[]{rootLabel, h2Title, h3Title, title, "", title});
                    System.out.println("\t\t" + indent + "[*] ." + this.treeDeep + " " + title + " (NO LINK)");
                }
            }
        }
    }

    /**
     * Adds one row per non-empty paragraph of {@code fragment}. A paragraph is discarded when, after
     * clean-up, it still contains both {@code <} and {@code >}, or when it contains {@code [edit]}.
     */
    private void collectParagraphs(String fragment, String indent, String rootLabel, String h2Title,
            String h3Title) {
        String[] paragraphs = AutomaticTextParser.splitAlsoBlankToken(fragment, "<p>");
        for (int i = 0; i < paragraphs.length; i++) {
            String paragraph = paragraphs[i];
            int closeAt = paragraph.indexOf("</p>");
            if (closeAt != -1) {
                paragraph = paragraph.substring(0, closeAt);
            }
            String text = depurateFromHtmlTag(paragraph);
            boolean looksTagFree = text.indexOf("<") == -1 || text.indexOf(">") == -1;
            if (looksTagFree && !text.equals("") && text.indexOf("[edit]") == -1) {
                String summary = AutomaticTextParser.summarizeStringContent(text, 30);
                this.result.add(new String[]{rootLabel, h2Title, h3Title, summary, "", text});
                System.out.println("\t\t" + indent + "[*] ." + this.treeDeep + " " + summary + " (TXT)");
            } else if (DEBUG && !text.equals("")) {
                System.out.println("DISCARD: " + text);
            }
        }
    }

    /**
     * Derives a display title from an HTML fragment.
     *
     * <p>The fragment is first cut at the closing tag {@code </tagName>} when present. The title is
     * then the value of the first {@code title="..."} attribute if there is one, otherwise the cut
     * fragment itself, otherwise {@code "UNKNOWN"}. Anything up to and including the first colon is
     * dropped, and the result is cleaned with {@link #depurateFromHtmlTag(String)}.
     *
     * @param fragment HTML fragment that starts inside the element
     * @param tagName  element name without angle brackets, e.g. {@code "h2"}
     * @return the cleaned title
     */
    private String getTitleName(String fragment, String tagName) {
        String title = "UNKNOWN";
        int closeAt = fragment.indexOf("</" + tagName + ">");
        if (closeAt != -1) {
            fragment = fragment.substring(0, closeAt);
            title = fragment;
        }
        if (fragment.indexOf("title=\"") != -1) {
            title = AutomaticTextParser.getInnerFirstOccurence(fragment, "title=\"", "\"");
        }
        int colonAt = title.indexOf(":");
        if (colonAt != -1) {
            title = title.substring(colonAt + 1).trim();
        }
        return depurateFromHtmlTag(title);
    }

    /**
     * Tells whether an item may be reported: its name must not be found in {@link #dropItemList}
     * and must not contain {@code "http:"}.
     */
    private boolean includeThisItemName(String name) {
        return AutomaticTextParser.isInVector(name, this.dropItemList) == -1 && name.indexOf("http:") == -1;
    }

    /**
     * Command-line entry point: crawls one page and prints the trace to standard output.
     *
     * <p>Arguments: {@code <label> <url> [<ignored> [<proxyHost> <proxyPort>]]}. The third argument
     * is accepted for compatibility but not used. A proxy is configured only when both host and
     * port are supplied.
     *
     * @param args command-line arguments
     */
    public static void main(String[] args) {
        // A lone proxy host (exactly four arguments) is rejected rather than silently ignored.
        if (args.length < 2 || args.length == 4) {
            System.err.println("Usage: HTMLDataGrabber <label> <url> [<ignored> [<proxyHost> <proxyPort>]]");
            return;
        }
        String label = args[0];
        String url = args[1];
        if (args.length > 4) {
            int proxyPort;
            try {
                proxyPort = Integer.parseInt(args[4]);
            } catch (NumberFormatException e) {
                System.err.println("Invalid proxy port: " + args[4]);
                return;
            }
            HttpSimpleClient.setProxyConfiguration(args[3], proxyPort);
        }
        HTMLDataGrabber grabber = new HTMLDataGrabber();
        grabber.dropItemList.add("See also");
        grabber.dropItemList.add("References");
        grabber.dropItemList.add("External links");
        grabber.dropItemList.add("Wikipedia (en)");
        grabber.dropItemList.add(">Contents");
        grabber.extractFromHtml(url, true, label);
    }

    /**
     * Cleans a fragment for display: replaces non-breaking-space entities with a blank, drops one
     * leading {@code >} left over from a split inside a tag, then passes the text through
     * {@code AutomaticTextParser.replaceStringBlock(text, "<", ">", "")} and trims it.
     *
     * @param str raw fragment
     * @return the cleaned, trimmed text
     */
    public String depurateFromHtmlTag(String str) {
        // The table extraction upper-cases the page while the section crawl does not, so the
        // entity can arrive in either spelling.
        String text = AutomaticTextParser.replaceString(str, "&NBSP;", " ");
        text = AutomaticTextParser.replaceString(text, "&nbsp;", " ").trim();
        if (text.startsWith(">") && text.length() > 1) {
            text = text.substring(1);
        }
        // NOTE(review): replaceStringBlock presumably removes every "<...>" span - confirm in AutomaticTextParser.
        return AutomaticTextParser.replaceStringBlock(text, "<", ">", "").trim();
    }

    /**
     * Section crawl: downloads {@code url}, walks its {@code <h2>} and {@code <h3>} sections and
     * appends the list items / paragraphs found to {@link #result}, printing a trace on the way.
     *
     * <p>The call is a no-op when the URL is already in {@link #accessedURL} or when
     * {@link #treeDeep} has reached {@link #treeDeepMax}. In {@link #DEBUG} mode the URL is only
     * recorded, not downloaded, once more than 10 URLs have been recorded.
     *
     * <p>For an H2 section that contains H3 sub-sections, only the sub-sections are examined (text
     * before the first {@code <h3} is skipped). An H2 section without sub-sections is examined as a
     * whole and its rows carry the H2 title in the H3 column.
     *
     * @param url         address of the page to download
     * @param followLinks whether links found in list items are crawled too
     * @param rootLabel   value stored in the first column of every produced row
     */
    public void extractFromHtml(String url, boolean followLinks, String rootLabel) {
        if (AutomaticTextParser.isInVector(url, this.accessedURL) != -1 || this.treeDeep >= this.treeDeepMax) {
            return;
        }
        if (DEBUG && this.accessedURL.size() > 10) {
            this.accessedURL.add(url);
            return;
        }
        StringBuilder indentBuilder = new StringBuilder();
        for (int i = 0; i < this.treeDeep; i++) {
            indentBuilder.append("\t\t\t");
        }
        String indent = indentBuilder.toString();

        String page = HttpSimpleClient.grabPage(url);
        String baseUrl = HttpSimpleClient.getHomeFromURL(url);
        this.accessedURL.add(url);
        this.treeDeep++;
        try {
            String[] h2Sections = AutomaticTextParser.splitAlsoBlankToken(page, "<h2");
            for (int i = 0; i < h2Sections.length; i++) {
                String h2Title = getTitleName(h2Sections[i], "h2");
                if (!includeThisItemName(h2Title)) {
                    continue;
                }
                System.out.println(indent + "H2." + this.treeDeep + "-" + h2Title);
                String[] h3Sections = AutomaticTextParser.splitAlsoBlankToken(h2Sections[i], "<h3");
                if (h3Sections.length > 1) {
                    for (int j = 1; j < h3Sections.length; j++) {
                        String h3Title = getTitleName(h3Sections[j], "h3");
                        if (includeThisItemName(h3Title)) {
                            System.out.println("\t" + indent + "H3." + this.treeDeep + "-" + h3Title);
                            extractParagraphContent(h3Sections[j], baseUrl, followLinks, indent, rootLabel,
                                    h2Title, h3Title);
                        }
                    }
                } else {
                    // No H3 here: pass an empty H3 title so the rows fall back to the H2 title
                    // instead of inheriting an H3 title from a previous H2 section.
                    extractParagraphContent(h2Sections[i], baseUrl, followLinks, indent, rootLabel, h2Title, "");
                }
            }
        } finally {
            // Restore the depth even if extraction fails, so the instance stays usable.
            this.treeDeep--;
        }
    }

    /**
     * Table extraction: cuts the region of {@code html} located after {@code startMarker} (and
     * before {@code endMarker}, when that marker is found), splits it into rows and cells and
     * returns the cleaned cell texts.
     *
     * <p>All arguments are upper-cased before matching, so matching is case-insensitive and the
     * returned texts are upper-case. The array has one line per row token found (the text before
     * the first row token is not a row) and as many columns as the first row that contains
     * {@code cellToken} has cell tokens, plus one. Cells that are never filled hold an empty string.
     *
     * @param html         page content
     * @param startMarker  text after which the region starts
     * @param endMarker    text at which the region ends; the region runs to the end of the page
     *                     when it is not found
     * @param rowToken     token that separates rows, e.g. {@code "<TR"}
     * @param cellToken    token that separates cells, e.g. {@code "<TD"}
     * @param cellEndToken token that ends a cell's content, e.g. {@code "</TD>"}
     * @param firstColumn  index of the first split element of a row that is treated as a cell
     * @return the table, or {@code null} when {@code startMarker} is not found or no row contains
     *         {@code cellToken}
     */
    public String[][] extractFromHtml(String html, String startMarker, String endMarker, String rowToken,
            String cellToken, String cellEndToken, int firstColumn) {
        String page = html.toUpperCase();
        String start = startMarker.toUpperCase();
        String end = endMarker.toUpperCase();
        String rowSep = rowToken.toUpperCase();
        String cellSep = cellToken.toUpperCase();
        String cellEnd = cellEndToken.toUpperCase();
        if (DEBUG) {
            System.out.println("DEBUG extractFromHtml on page of " + page.length() + " chrs");
        }
        int startAt = page.indexOf(start);
        if (startAt == -1) {
            return null;
        }
        String region = page.substring(startAt + start.length());
        int endAt = region.indexOf(end);
        if (endAt != -1) {
            region = region.substring(0, endAt);
        }
        String[] rows = AutomaticTextParser.splitAlsoBlankToken(region, rowSep);
        if (DEBUG) {
            System.out.println("DEBUG extractFromHtml on page of " + region.length() + " chrs\nhas been divided into " + rows.length + " rows using \"" + rowSep + "\"");
        }

        String[][] table = null;
        for (int r = 1; r < rows.length; r++) {
            String row = rows[r].trim();
            if (row.length() == 0 || row.indexOf(cellSep) == -1) {
                continue;
            }
            if (DEBUG) {
                System.out.println("ROW:" + r + " " + row);
            }
            if (table == null) {
                // The first usable row fixes the column count for the whole table.
                table = new String[rows.length - 1][AutomaticTextParser.countTokenOccurrency(row, cellSep) + 1];
                for (int i = 0; i < table.length; i++) {
                    for (int j = 0; j < table[i].length; j++) {
                        table[i][j] = "";
                    }
                }
            }
            String[] cells = AutomaticTextParser.splitAlsoBlankToken(row, cellSep);
            if (DEBUG) {
                System.out.println("The row has been divided into " + cells.length + " cols using \"" + cellSep + "\" and starting from " + firstColumn);
            }
            String[] line = table[r - 1];
            for (int c = firstColumn; c < cells.length; c++) {
                if (DEBUG) {
                    System.out.println("*******************\nELEM:" + c + "\n" + cells[c].trim());
                }
                int column = c - firstColumn;
                if (column < line.length) {
                    line[column] = extractCellText(cells[c], cellSep, cellEnd);
                    if (DEBUG) {
                        System.out.println("has depurated from html tags into:\n" + line[column] + "\n*******************\n");
                    }
                }
            }
        }
        return table;
    }

    /**
     * Returns the cleaned text of one cell. For a tag-like cell separator the content between the
     * first {@code >} and {@code cellEnd} is used; otherwise the cell is cut at {@code cellEnd},
     * or used whole when {@code cellEnd} is empty or does not occur in the cell.
     */
    private String extractCellText(String cell, String cellSep, String cellEnd) {
        if (cellSep.startsWith("<")) {
            return depurateFromHtmlTag(AutomaticTextParser.getInnerFirstOccurence(cell, ">", cellEnd));
        }
        if (cellEnd.equals("")) {
            return depurateFromHtmlTag(cell);
        }
        int cutAt = cell.indexOf(cellEnd);
        // A cell without its end token is kept whole instead of failing on substring(0, -1).
        return depurateFromHtmlTag(cutAt == -1 ? cell : cell.substring(0, cutAt));
    }

    /**
     * Extracts an HTML table ({@code <TR>} rows, {@code <TD>} cells closed by {@code </TD>}) from
     * the region delimited by the two markers. The first split element of each row, i.e. the text
     * before its first {@code <TD}, is skipped.
     *
     * @param str  page content
     * @param str2 text after which the region starts
     * @param str3 text at which the region ends
     * @return the table, or {@code null} as described for the seven-argument {@code extractFromHtml}
     */
    public String[][] extractTableFromHtml(String str, String str2, String str3) {
        return extractFromHtml(str, str2, str3, "<TR", "<TD", "</TD>", 1);
    }

    /**
     * Downloads a page by delegating to {@code HttpSimpleClient.grabPage(String)}.
     *
     * @param str address of the page
     * @return whatever {@code HttpSimpleClient.grabPage} returns for that address
     */
    public String getRemotePage(String str) {
        return HttpSimpleClient.grabPage(str);
    }

    /**
     * Downloads a page by delegating to {@code HttpSimpleClient.grabPage(String, String)}.
     *
     * @param str  address of the page
     * @param str2 second argument forwarded unchanged; its meaning is defined by
     *             {@code HttpSimpleClient} and is not visible here
     * @return whatever {@code HttpSimpleClient.grabPage} returns
     */
    public String getRemotePage(String str, String str2) {
        return HttpSimpleClient.grabPage(str, str2);
    }
}
