package org.apache.any23.plugin.htmlscraper;

import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.extractors.ArticleExtractor;
import de.l3s.boilerpipe.extractors.CanolaExtractor;
import de.l3s.boilerpipe.extractors.DefaultExtractor;
import de.l3s.boilerpipe.extractors.LargestContentExtractor;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;

/* loaded from: input_file:org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.class */
public class HTMLScraperExtractor implements Extractor.ContentExtractor {
    public static final IRI PAGE_CONTENT_DE_PROPERTY = SimpleValueFactory.getInstance().createIRI("http://vocab.sindice.net/any23#pagecontent/de");
    public static final IRI PAGE_CONTENT_AE_PROPERTY = SimpleValueFactory.getInstance().createIRI("http://vocab.sindice.net/any23#pagecontent/ae");
    public static final IRI PAGE_CONTENT_LCE_PROPERTY = SimpleValueFactory.getInstance().createIRI("http://vocab.sindice.net/any23#pagecontent/lce");
    public static final IRI PAGE_CONTENT_CE_PROPERTY = SimpleValueFactory.getInstance().createIRI("http://vocab.sindice.net/any23#pagecontent/ce");
    private final List<ExtractionRule> extractionRules = new ArrayList();

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor$ExtractionRule.class */
    public class ExtractionRule {
        public final String name;
        public final IRI property;
        public final BoilerpipeExtractor boilerpipeExtractor;

        ExtractionRule(String str, IRI iri, BoilerpipeExtractor boilerpipeExtractor) {
            if (str == null) {
                throw new NullPointerException("name cannot be null.");
            }
            if (iri == null) {
                throw new NullPointerException("property cannot be null.");
            }
            if (boilerpipeExtractor == null) {
                throw new NullPointerException("extractor cannot be null.");
            }
            this.name = str;
            this.property = iri;
            this.boilerpipeExtractor = boilerpipeExtractor;
        }
    }

    public HTMLScraperExtractor() {
        loadDefaultRules();
    }

    public void addTextExtractor(String str, IRI iri, BoilerpipeExtractor boilerpipeExtractor) {
        this.extractionRules.add(new ExtractionRule(str, iri, boilerpipeExtractor));
    }

    public String[] getTextExtractors() {
        ArrayList arrayList = new ArrayList();
        Iterator<ExtractionRule> it = this.extractionRules.iterator();
        while (it.hasNext()) {
            arrayList.add(it.next().name);
        }
        return (String[]) arrayList.toArray(new String[arrayList.size()]);
    }

    public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream inputStream, ExtractionResult extractionResult) throws IOException, ExtractionException {
        try {
            IRI documentIRI = extractionContext.getDocumentIRI();
            for (ExtractionRule extractionRule : this.extractionRules) {
                extractionResult.writeTriple(documentIRI, extractionRule.property, SimpleValueFactory.getInstance().createLiteral(extractionRule.boilerpipeExtractor.getText(new InputStreamReader(inputStream))));
            }
        } catch (BoilerpipeProcessingException e) {
            throw new ExtractionException("Error while applying text processor " + ArticleExtractor.class, e);
        }
    }

    public ExtractorDescription getDescription() {
        return HTMLScraperExtractorFactory.getDescriptionInstance();
    }

    public void setStopAtFirstError(boolean z) {
    }

    private void loadDefaultRules() {
        addTextExtractor("default-extractor", PAGE_CONTENT_DE_PROPERTY, DefaultExtractor.getInstance());
        addTextExtractor("article-extractor", PAGE_CONTENT_AE_PROPERTY, ArticleExtractor.getInstance());
        addTextExtractor("large-content-extractor", PAGE_CONTENT_LCE_PROPERTY, LargestContentExtractor.getInstance());
        addTextExtractor("canola-extractor", PAGE_CONTENT_CE_PROPERTY, CanolaExtractor.getInstance());
    }
}
