package org.apache.any23.cli;

import com.beust.jcommander.IStringConverter;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import java.io.File;
import java.net.URL;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.any23.plugin.crawler.CrawlerListener;
import org.apache.any23.plugin.crawler.SiteCrawler;
import org.apache.any23.source.StringDocumentSource;

/**
 * Command line tool that crawls a web site starting from a single seed URL and
 * feeds every visited HTML page to the extraction logic inherited from
 * {@link Rover}.
 *
 * <p>All options are bound by JCommander through the {@link Parameter}
 * annotations on the fields below.
 */
@Parameters(commandNames = {"crawler"}, commandDescription = "Any23 Crawler Command Line Tool.")
public class Crawler extends Rover {

    /**
     * Monitor held around every call to the inherited {@code performExtraction},
     * so that page callbacks never run the extraction concurrently.
     */
    private final Object roverLock = new Object();

    @Parameter(names = {"-pf", "--pagefilter"}, description = "Regex used to filter out page URLs during crawling.", converter = PatterConverter.class)
    private Pattern pageFilter = Pattern.compile(SiteCrawler.DEFAULT_PAGE_FILTER_RE);

    // Defaults to a uniquely named folder under the JVM temporary directory.
    @Parameter(names = {"-sf", "--storagefolder"}, description = "Folder used to store crawler temporary data.", converter = FileConverter.class)
    private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID().toString());

    @Parameter(names = {"-nc", "--numcrawlers"}, description = "Sets the number of crawlers.")
    private int numCrawlers = 10;

    @Parameter(names = {"-mp", "--maxpages"}, description = "Max number of pages before interrupting crawl.")
    private int maxPages = Integer.MAX_VALUE;

    @Parameter(names = {"-md", "--maxdepth"}, description = "Max allowed crawler depth.")
    private int maxDepth = Integer.MAX_VALUE;

    // NOTE(review): a default delay of Integer.MAX_VALUE milliseconds looks
    // unintended (it is not an "unlimited" value like the two limits above).
    // Left unchanged to preserve current behavior; confirm the intended default.
    @Parameter(names = {"-pd", "--politenessdelay"}, description = "Politeness delay in milliseconds.")
    private int politenessDelay = Integer.MAX_VALUE;

    /**
     * JCommander converter turning the {@code --pagefilter} argument into a
     * compiled {@link Pattern}.
     */
    public static final class PatterConverter implements IStringConverter<Pattern> {

        /**
         * Compiles the given command line value as a regular expression.
         *
         * @param str the raw option value.
         * @return the compiled pattern.
         * @throws ParameterException if {@code str} is not a valid regular expression.
         */
        @Override
        public Pattern convert(String str) {
            try {
                return Pattern.compile(str);
            } catch (PatternSyntaxException e) {
                throw new ParameterException(String.format("Invalid page filter, '%s' must be a regular expression.", str));
            }
        }
    }

    /**
     * Validates the command line input, prepares the storage folder, configures
     * a {@link SiteCrawler} and starts the crawl on the single seed URL.
     *
     * @throws IllegalArgumentException if the number of seeds is not exactly one.
     * @throws IllegalStateException if the storage folder is a regular file or
     *         cannot be created.
     * @throws Exception any other failure raised while configuring or crawling.
     */
    public void run() throws Exception {
        super.configure();

        if (inputURIs.size() != 1) {
            throw new IllegalArgumentException("Expected just one seed.");
        }
        final URL seed = new URL((String) inputURIs.get(0));

        if (storageFolder.isFile()) {
            throw new IllegalStateException(String.format("Storage folder %s can not be a file, must be a directory", storageFolder));
        }
        if (!storageFolder.exists() && !storageFolder.mkdirs()) {
            throw new IllegalStateException(String.format("Storage folder %s can not be created, please verify you have enough permissions", storageFolder));
        }

        final SiteCrawler siteCrawler = new SiteCrawler(storageFolder);
        siteCrawler.setNumOfCrawlers(numCrawlers);
        siteCrawler.setMaxPages(maxPages);
        siteCrawler.setMaxDepth(maxDepth);
        siteCrawler.setPolitenessDelay(politenessDelay);

        siteCrawler.addListener(new CrawlerListener() {
            @Override
            public void visitedPage(Page page) {
                final String pageUrl = page.getWebURL().getURL();
                System.err.println(String.format("Processing page: [%s]", pageUrl));

                // Held as Object so the instanceof check below is a real
                // runtime test, whatever static type the getter declares.
                final Object parseData = page.getParseData();
                if (!(parseData instanceof HtmlParseData)) {
                    return; // only HTML pages are handed to the extraction
                }
                final HtmlParseData htmlParseData = (HtmlParseData) parseData;

                try {
                    synchronized (roverLock) {
                        Crawler.super.performExtraction(new StringDocumentSource(htmlParseData.getHtml(), pageUrl));
                    }
                } catch (Exception e) {
                    // Best effort: a failing page is reported and the crawl goes on.
                    System.err.println(String.format("Error while processing page [%s], error: %s .", pageUrl, e.getMessage()));
                }
            }
        });

        // Print the extraction reports when the JVM shuts down.
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                try {
                    System.err.println(Crawler.super.printReports());
                } catch (Exception e) {
                    e.printStackTrace(System.err);
                }
            }
        });

        // NOTE(review): the meaning of the trailing 'true' is defined by
        // SiteCrawler.start and is not visible here; confirm against that API.
        siteCrawler.start(seed, pageFilter, true);
    }
}
