package de.tudarmstadt.ukp.dkpro.core.io.tei;

import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.io.tei.internal.TeiConstants;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Stack;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.internal.EnhancedClassFile;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Logger;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.dom4j.io.SAXWriter;
import org.jaxen.JaxenException;
import org.jaxen.dom4j.Dom4jXPath;
import org.maltparser.core.syntaxgraph.RootLabels;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

@TypeCapability(outputs = {"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent", "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity"})
@ResourceMetaData(name = "de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader", description = "Reader for the TEI XML.", version = "1.8.0", vendor = "DKPro Core Project", copyright = "Copyright 2010-2015\n                            Ubiquitous Knowledge Processing (UKP) Lab\n                            Technische Universität Darmstadt")
@EnhancedClassFile
/* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader.class */
public class TeiReader extends ResourceCollectionReaderBase {
    /** Name of the parameter that switches the creation of token annotations on or off. */
    public static final String PARAM_READ_TOKEN = "readToken";

    @ConfigurationParameter(name = "readToken", mandatory = true, defaultValue = {"true"}, description = "Write token annotations to the CAS.")
    private boolean readToken;
    /** Name of the parameter that switches part-of-speech annotations on or off. */
    public static final String PARAM_READ_POS = "readPOS";

    @ConfigurationParameter(name = "readPOS", mandatory = true, defaultValue = {"true"}, description = "Write part-of-speech annotations to the CAS.")
    private boolean readPOS;
    /** Name of the parameter that switches lemma annotations on or off. */
    public static final String PARAM_READ_LEMMA = "readLemma";

    @ConfigurationParameter(name = "readLemma", mandatory = true, defaultValue = {"true"}, description = "Write lemma annotations to the CAS.")
    private boolean readLemma;
    /** Name of the parameter that switches sentence annotations on or off. */
    public static final String PARAM_READ_SENTENCE = "readSentence";

    @ConfigurationParameter(name = "readSentence", mandatory = true, defaultValue = {"true"}, description = "Write sentence annotations to the CAS.")
    private boolean readSentence;
    /** Name of the parameter that switches constituent annotations on or off. */
    public static final String PARAM_READ_CONSTITUENT = "readConstituent";

    @ConfigurationParameter(name = "readConstituent", mandatory = true, defaultValue = {"true"}, description = "Write constituent annotations to the CAS.")
    private boolean readConstituent;
    /** Name of the parameter that switches named entity annotations on or off. */
    public static final String PARAM_READ_NAMED_ENTITY = "readNamedEntity";

    @ConfigurationParameter(name = "readNamedEntity", mandatory = true, defaultValue = {"true"}, description = "Write named entity annotations to the CAS.")
    private boolean readNamedEntity;
    /** Name of the parameter that switches paragraph annotations on or off. */
    public static final String PARAM_READ_PARAGRAPH = "readParagraph";

    @ConfigurationParameter(name = "readParagraph", mandatory = true, defaultValue = {"true"}, description = "Write paragraphs annotations to the CAS.")
    private boolean readParagraph;
    /** Name of the parameter that makes the handler take the document ID from the {@code xml:id} attribute. */
    public static final String PARAM_USE_XML_ID = "useXmlId";

    @ConfigurationParameter(name = PARAM_USE_XML_ID, mandatory = true, defaultValue = {"false"}, description = "Use the xml:id attribute on the TEI elements as document ID. Mind that many TEI files\nmay not have this attribute on all TEI elements and you may end up with no document ID\nat all. Also mind that the IDs should be unique.")
    private boolean useXmlId;
    /** Name of the parameter that makes generated document IDs start with the file name instead of the full path. */
    public static final String PARAM_USE_FILENAME_ID = "useFilenameId";

    @ConfigurationParameter(name = PARAM_USE_FILENAME_ID, mandatory = true, defaultValue = {"false"}, description = "When not using the XML ID, use only the filename instead of the whole URL as ID. Mind that\nthe filenames should be unique in this case.")
    private boolean useFilenameId;
    /** Name of the parameter that keeps ignorable whitespace reported by the SAX events out of the document text. */
    public static final String PARAM_OMIT_IGNORABLE_WHITESPACE = "omitIgnorableWhitespace";

    @ConfigurationParameter(name = PARAM_OMIT_IGNORABLE_WHITESPACE, mandatory = true, defaultValue = {"false"}, description = "Do not write <em>ignoreable whitespace</em> from the XML file to the CAS.")
    private boolean omitIgnorableWhitespace;
    /** Name of the optional parameter holding the location of the part-of-speech mapping file. */
    public static final String PARAM_POS_MAPPING_LOCATION = "POSMappingLocation";

    @ConfigurationParameter(name = "POSMappingLocation", mandatory = false, description = "Location of the mapping file for part-of-speech tags to UIMA types.")
    protected String mappingPosLocation;
    /** Name of the optional parameter naming the part-of-speech tag set used for the mapping. */
    public static final String PARAM_POS_TAG_SET = "POSTagSet";

    @ConfigurationParameter(name = "POSTagSet", mandatory = false, description = "Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the\ntag set defined as part of the model meta data. This can be useful if a custom model is\nspecified which does not have such meta data, or it can be used in readers.")
    protected String posTagset;
    /** Name of the parameter that makes utterance elements be treated like sentence elements. */
    public static final String PARAM_UTTERANCES_AS_SENTENCES = "utterancesAsSentences";

    @ConfigurationParameter(name = PARAM_UTTERANCES_AS_SENTENCES, mandatory = true, defaultValue = {"false"}, description = "Interpret utterances \"u\" as sentenes \"s\". (EXPERIMENTAL)")
    private boolean utterancesAsSentences;
    // Iterator over the TEI elements selected from the current resource. nextTeiElement() sets it
    // to null once neither this iterator nor the underlying resource reader has anything left.
    private Iterator<Element> teiElementIterator;
    // TEI element that the next getNext(CAS) call converts; null when nothing is pending.
    private Element currentTeiElement;
    // Resource most recently fetched via nextFile(); passed to initCas() and used for document IDs.
    private ResourceCollectionReaderBase.Resource currentResource;
    // Reset to 0 whenever a new resource is parsed and incremented each time nextTeiElement()
    // advances; appended after '#' to generated document IDs.
    private int currentTeiElementNumber;
    // Created in initialize(), configured per CAS in getNext(), and queried for the UIMA type of a
    // part-of-speech tag when tokens are closed.
    private MappingProvider posMappingProvider;

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader$ConstituentWrapper.class */
    /**
     * Pairs a constituent that is still open during SAX processing with the child annotations
     * collected for it so far. The children are turned into an FSArray when the constituent's
     * closing tag is seen.
     */
    private static class ConstituentWrapper {
        /** The constituent annotation under construction. */
        public final Constituent constituent;
        /** Child constituents and tokens gathered while the constituent is open. */
        public final List<Annotation> children = new ArrayList<>();

        /**
         * @param constituent the constituent annotation to wrap
         */
        public ConstituentWrapper(Constituent constituent) {
            this.constituent = constituent;
        }
    }

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader$Handler.class */
    /**
     * Common base of the SAX handlers used by this reader. It only stores the target
     * {@link JCas} and a {@link Logger} so that concrete handlers can reach both.
     */
    protected static abstract class Handler extends DefaultHandler {
        private Logger logger;
        private JCas jcas;

        protected Handler() {
            // no state to set up
        }

        /**
         * @return the JCas most recently handed to {@link #setJCas(JCas)}, or {@code null} if
         *         none was set
         */
        protected JCas getJCas() {
            return jcas;
        }

        /**
         * @param jCas the JCas that receives the text and annotations
         */
        public void setJCas(JCas jCas) {
            jcas = jCas;
        }

        /**
         * @return the logger most recently handed to {@link #setLogger(Logger)}, or {@code null}
         *         if none was set
         */
        public Logger getLogger() {
            return logger;
        }

        /**
         * @param logger the logger made available to the handler
         */
        public void setLogger(Logger logger) {
            this.logger = logger;
        }
    }

    /* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/io/tei/TeiReader$TeiHandler.class */
    public class TeiHandler extends Handler {
        // Document ID computed when the TEI document element is opened; written to the document
        // metadata when a title element outside the text closes.
        private String documentId = null;
        // True once the first title found outside the text element has been stored.
        private boolean titleSet = false;
        // True between the start and the end of a "text" element.
        private boolean inTextElement = false;
        // When true, character data is appended to the buffer.
        private boolean captureText = false;
        // Buffer offsets at which the currently open paragraph / sentence / token began;
        // reset to -1 when the corresponding element closes.
        private int paragraphStart = -1;
        private int sentenceStart = -1;
        private int tokenStart = -1;
        // Tag and lemma read from the attributes of the most recently opened token element.
        private String posTag = null;
        private String lemma = null;
        // Constituents that have been opened but not yet closed, innermost on top.
        private Stack<ConstituentWrapper> constituents = new Stack<>();
        // Named entities that have been opened but not yet closed, innermost on top.
        private Stack<NamedEntity> namedEntities = new Stack<>();
        // Collects the captured text; becomes the document text in endDocument().
        private final StringBuilder buffer = new StringBuilder();

        /** Creates a handler with empty state. */
        public TeiHandler() {
        }

        /**
         * Publishes everything captured in the text buffer as the document text of the JCas.
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endDocument() throws SAXException {
            final JCas target = getJCas();
            target.setDocumentText(this.buffer.toString());
        }

        /**
         * @return the buffer in which the captured text is accumulated; annotation offsets
         *         created by this handler are positions in this buffer
         */
        protected StringBuilder getBuffer() {
            return this.buffer;
        }

        /**
         * Reacts to an opening tag. Outside the text element only the TEI document element
         * (document ID) and title elements (start capturing) are of interest. Inside the text
         * element the current buffer length is remembered as the start offset of sentences,
         * paragraphs, named entities, constituents and tokens.
         *
         * @param str        namespace URI (unused)
         * @param str2       local name (unused)
         * @param str3       qualified element name, used for all tag comparisons
         * @param attributes attributes of the element
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
            final String tag = str3;

            // Header handling: only relevant while we are not inside the text element.
            if (!this.inTextElement) {
                if (TeiConstants.TAG_TEI_DOC.equals(tag)) {
                    if (TeiReader.this.useXmlId) {
                        this.documentId = attributes.getValue("xml:id");
                    } else {
                        final String path = TeiReader.this.currentResource.getPath();
                        final String prefix = TeiReader.this.useFilenameId ? FilenameUtils.getName(path) : path;
                        this.documentId = prefix + "#" + TeiReader.this.currentTeiElementNumber;
                    }
                    return;
                }
                if (TeiConstants.TAG_TITLE.equals(tag)) {
                    this.captureText = true;
                    return;
                }
            }

            if ("text".equals(tag)) {
                this.captureText = true;
                this.inTextElement = true;
                return;
            }

            // Everything below only applies to elements inside the text element.
            if (!this.inTextElement) {
                return;
            }

            final boolean opensSentence = "s".equals(tag)
                    || (TeiReader.this.utterancesAsSentences && TeiConstants.TAG_U.equals(tag));

            if (opensSentence) {
                this.sentenceStart = getBuffer().length();
            } else if ("p".equals(tag)) {
                this.paragraphStart = getBuffer().length();
            } else if (TeiReader.this.readNamedEntity && TeiConstants.TAG_RS.equals(tag)) {
                final NamedEntity entity = new NamedEntity(getJCas());
                entity.setBegin(getBuffer().length());
                entity.setValue(attributes.getValue("type"));
                this.namedEntities.push(entity);
            } else if (TeiReader.this.readConstituent && TeiConstants.TAG_PHRASE.equals(tag)) {
                // The outermost phrase gets an artificial ROOT constituent as its parent.
                if (this.constituents.isEmpty()) {
                    final ROOT rootNode = new ROOT(getJCas());
                    rootNode.setBegin(getBuffer().length());
                    rootNode.setConstituentType(RootLabels.DEFAULT_ROOTSYMBOL);
                    this.constituents.push(new ConstituentWrapper(rootNode));
                }
                final Constituent phrase = new Constituent(getJCas());
                phrase.setBegin(getBuffer().length());
                phrase.setConstituentType(attributes.getValue("type"));
                phrase.setSyntacticFunction(attributes.getValue("function"));
                this.constituents.push(new ConstituentWrapper(phrase));
            } else if (TeiConstants.TAG_WORD.equals(tag) || "c".equals(tag) || TeiConstants.TAG_MULTIWORD.equals(tag)) {
                this.tokenStart = getBuffer().length();
                // Prefer the "pos" attribute; fall back to "type" when it is missing or empty.
                final String posAttribute = attributes.getValue("pos");
                this.posTag = StringUtils.isNotEmpty(posAttribute) ? posAttribute : attributes.getValue("type");
                this.lemma = attributes.getValue("lemma");
            }
        }

        /**
         * Reacts to a closing tag by finishing whatever the matching opening tag started: the
         * title and document ID are stored in the metadata, and sentences, paragraphs, named
         * entities, constituents and tokens are created with the current buffer length as their
         * end offset.
         *
         * <p>Token elements may be nested (for example a multiword element wrapping word
         * elements). Closing an inner token resets the token start, so when the outer element
         * closes there is no open token span left; such an outer element is skipped instead of
         * failing on an invalid offset.
         *
         * @param str  namespace URI (unused)
         * @param str2 local name (unused)
         * @param str3 qualified element name, used for all tag comparisons
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endElement(String str, String str2, String str3) throws SAXException {
            if (!this.inTextElement && TeiConstants.TAG_TITLE.equals(str3)) {
                DocumentMetaData documentMetaData = DocumentMetaData.get(getJCas());
                // Only the first title becomes the document title.
                if (!this.titleSet) {
                    documentMetaData.setDocumentTitle(getBuffer().toString().trim());
                    this.titleSet = true;
                }
                documentMetaData.setDocumentId(this.documentId);
                // Title text must not leak into the document text.
                getBuffer().setLength(0);
                this.captureText = false;
                return;
            }
            if ("text".equals(str3)) {
                this.captureText = false;
                this.inTextElement = false;
                return;
            }
            if (this.inTextElement && ("s".equals(str3) || (TeiReader.this.utterancesAsSentences && TeiConstants.TAG_U.equals(str3)))) {
                if (TeiReader.this.readSentence) {
                    new Sentence(getJCas(), this.sentenceStart, getBuffer().length()).addToIndexes();
                }
                this.sentenceStart = -1;
                return;
            }
            if (this.inTextElement && "p".equals(str3)) {
                if (TeiReader.this.readParagraph) {
                    new Paragraph(getJCas(), this.paragraphStart, getBuffer().length()).addToIndexes();
                }
                this.paragraphStart = -1;
                return;
            }
            if (TeiReader.this.readNamedEntity && this.inTextElement && TeiConstants.TAG_RS.equals(str3)) {
                NamedEntity entity = this.namedEntities.pop();
                entity.setEnd(getBuffer().length());
                entity.addToIndexes();
                return;
            }
            if (TeiReader.this.readConstituent && this.inTextElement && TeiConstants.TAG_PHRASE.equals(str3)) {
                ConstituentWrapper closed = this.constituents.pop();
                closed.constituent.setEnd(getBuffer().length());
                if (!this.constituents.isEmpty()) {
                    ConstituentWrapper parent = this.constituents.peek();
                    closed.constituent.setParent(parent.constituent);
                    parent.children.add(closed.constituent);
                }
                closed.constituent.setChildren(FSCollectionFactory.createFSArray(getJCas(), closed.children));
                closed.constituent.addToIndexes();
                // Once only the artificial ROOT is left, the outermost phrase has been closed
                // and the ROOT can be completed as well. The emptiness check keeps peek() from
                // throwing, consistent with the guard used for the parent lookup above.
                if (!this.constituents.isEmpty() && this.constituents.peek().constituent instanceof ROOT) {
                    ConstituentWrapper root = this.constituents.pop();
                    root.constituent.setEnd(getBuffer().length());
                    root.constituent.setChildren(FSCollectionFactory.createFSArray(getJCas(), root.children));
                    root.constituent.addToIndexes();
                }
                return;
            }
            if (this.inTextElement) {
                if (TeiConstants.TAG_WORD.equals(str3) || "c".equals(str3) || TeiConstants.TAG_MULTIWORD.equals(str3)) {
                    // tokenStart is -1 when an inner token element already consumed the span
                    // (nested token elements); substring(-1, ...) would throw in that case.
                    if (this.tokenStart >= 0 && StringUtils.isNotBlank(getBuffer().substring(this.tokenStart, getBuffer().length()))) {
                        Token token = new Token(getJCas(), this.tokenStart, getBuffer().length());
                        trim(token);
                        if (this.posTag != null && TeiReader.this.readPOS) {
                            POS pos = (POS) getJCas().getCas().createAnnotation(TeiReader.this.posMappingProvider.getTagType(this.posTag), token.getBegin(), token.getEnd());
                            pos.setPosValue(this.posTag);
                            pos.addToIndexes();
                            token.setPos(pos);
                        }
                        if (this.lemma != null && TeiReader.this.readLemma) {
                            Lemma lemmaAnnotation = new Lemma(getJCas(), token.getBegin(), token.getEnd());
                            lemmaAnnotation.setValue(this.lemma);
                            lemmaAnnotation.addToIndexes();
                            token.setLemma(lemmaAnnotation);
                        }
                        if (TeiReader.this.readToken) {
                            // Attach the token to the innermost open constituent, if any.
                            if (!this.constituents.isEmpty()) {
                                ConstituentWrapper parent = this.constituents.peek();
                                token.setParent(parent.constituent);
                                parent.children.add(token);
                            }
                            token.addToIndexes();
                        }
                    }
                    this.tokenStart = -1;
                }
            }
        }

        /**
         * Appends character data to the text buffer while capturing is enabled; otherwise the
         * data is dropped.
         *
         * @param cArr character array delivered by the SAX source
         * @param i    offset of the first character to use
         * @param i2   number of characters to use
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void characters(char[] cArr, int i, int i2) throws SAXException {
            if (!this.captureText) {
                return;
            }
            this.buffer.append(cArr, i, i2);
        }

        /**
         * Appends ignorable whitespace to the text buffer, but only while capturing is enabled
         * and the reader has not been configured to omit it.
         *
         * @param cArr character array delivered by the SAX source
         * @param i    offset of the first character to use
         * @param i2   number of characters to use
         */
        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void ignorableWhitespace(char[] cArr, int i, int i2) throws SAXException {
            final boolean keepWhitespace = this.captureText && !TeiReader.this.omitIgnorableWhitespace;
            if (keepWhitespace) {
                this.buffer.append(cArr, i, i2);
            }
        }

        /**
         * Shrinks the given annotation so that it neither begins nor ends with whitespace,
         * judged by the characters of the text buffer at the annotation's offsets.
         *
         * <p>If the span contains only whitespace, the result is an empty annotation located at
         * the original end offset.
         *
         * @param annotation the annotation whose begin and end offsets are adjusted in place
         */
        private void trim(Annotation annotation) {
            StringBuilder buffer = getBuffer();
            int begin = annotation.getBegin();
            int end = annotation.getEnd();
            // Bounded by 'end' so that an all-whitespace span cannot push 'begin' beyond the
            // annotation or past the end of the buffer.
            while (begin < end && Character.isWhitespace(buffer.charAt(begin))) {
                begin++;
            }
            // Always keeps at least one character once 'begin' points at non-whitespace.
            while (end > begin + 1 && Character.isWhitespace(buffer.charAt(end - 1))) {
                end--;
            }
            annotation.setBegin(begin);
            annotation.setEnd(end);
        }

        /**
         * Pure delegation to the inherited accessor; marked as a compiler-generated bridge
         * method by the decompiler.
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader.Handler
        public /* bridge */ /* synthetic */ Logger getLogger() {
            return super.getLogger();
        }

        /**
         * Pure delegation to the inherited mutator; marked as a compiler-generated bridge
         * method by the decompiler.
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader.Handler
        public /* bridge */ /* synthetic */ void setLogger(Logger logger) {
            super.setLogger(logger);
        }

        /**
         * Pure delegation to the inherited mutator; marked as a compiler-generated bridge
         * method by the decompiler.
         */
        @Override // de.tudarmstadt.ukp.dkpro.core.io.tei.TeiReader.Handler
        public /* bridge */ /* synthetic */ void setJCas(JCas jCas) {
            super.setJCas(jCas);
        }
    }

    /**
     * Validates the configuration, positions the reader on the first TEI element and creates
     * the part-of-speech mapping provider.
     *
     * @param uimaContext the UIMA context passed on to the base class
     * @throws ResourceInitializationException if part-of-speech reading is requested without
     *         token reading, or if locating the first TEI element fails
     */
    @Override // de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase, org.apache.uima.fit.component.CasCollectionReader_ImplBase
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        if (this.readPOS && !this.readToken) {
            throw new ResourceInitializationException(new IllegalArgumentException("Setting readPOS to 'true' requires readToken to be 'true' too."));
        }
        try {
            // Start with an empty iterator so that nextTeiElement() loads the first resource.
            this.teiElementIterator = Arrays.asList(new Element[0]).iterator();
            nextTeiElement();
        } catch (IOException | CollectionException e) {
            // Propagate the failure; a reader that cannot load its first element must not
            // report a successful initialisation.
            throw new ResourceInitializationException(e);
        }
        this.posMappingProvider = MappingProviderFactory.createPosMappingProvider(this.mappingPosLocation, this.posTagset, getLanguage());
    }

    /**
     * Advances {@code currentTeiElement} to the next TEI element. Resources are parsed one
     * after another until one yields a TEI element or no resource is left. Paths ending in
     * ".gz" are read through a GZIP stream. When both the element iterator and the resource
     * list are exhausted, the iterator is set to {@code null} so that later calls only clear
     * the current element.
     *
     * @throws CollectionException if the base reader fails to provide the next resource
     * @throws IOException if a resource cannot be read, parsed, or queried for TEI elements
     */
    private void nextTeiElement() throws CollectionException, IOException {
        if (this.teiElementIterator == null) {
            this.currentTeiElement = null;
            return;
        }

        while (!this.teiElementIterator.hasNext() && super.hasNext()) {
            this.currentResource = nextFile();
            InputStream stream = null;
            try {
                stream = this.currentResource.getInputStream();
                if (this.currentResource.getPath().endsWith(".gz")) {
                    stream = new GZIPInputStream(stream);
                }

                final InputSource source = new InputSource(stream);
                source.setPublicId(this.currentResource.getLocation());
                source.setSystemId(this.currentResource.getLocation());
                final Document document = new SAXReader().read(source);

                final Dom4jXPath teiPath = new Dom4jXPath("//tei:TEI");
                teiPath.addNamespace("tei", TeiConstants.TEI_NS);
                this.teiElementIterator = teiPath.selectNodes(document).iterator();
                this.currentTeiElementNumber = 0;
            } catch (DocumentException | JaxenException e) {
                throw new IOException(e);
            } finally {
                IOUtils.closeQuietly(stream);
            }
        }

        if (this.teiElementIterator.hasNext()) {
            this.currentTeiElement = this.teiElementIterator.next();
        } else {
            this.currentTeiElement = null;
        }
        this.currentTeiElementNumber++;

        // Nothing left anywhere: drop the iterator to mark the reader as exhausted.
        if (!super.hasNext() && !this.teiElementIterator.hasNext()) {
            this.teiElementIterator = null;
        }
    }

    /**
     * @return {@code true} while the element iterator is still alive or a TEI element is
     *         waiting to be converted by {@code getNext}
     */
    @Override // de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase, org.apache.uima.collection.base_cpm.BaseCollectionReader
    public boolean hasNext() throws IOException, CollectionException {
        return this.teiElementIterator != null || this.currentTeiElement != null;
    }

    /**
     * Fills the given CAS from the current TEI element and then advances to the next one.
     * The element is replayed as SAX events into a handler obtained from
     * {@link #newSaxHandler()}.
     *
     * @param cas the CAS to fill
     * @throws IOException if configuring the part-of-speech mapping or replaying the element
     *         fails, or if advancing to the next element fails
     * @throws CollectionException if the JCas cannot be obtained from the CAS, or if advancing
     *         to the next element fails
     */
    @Override // org.apache.uima.collection.CollectionReader
    public void getNext(CAS cas) throws IOException, CollectionException {
        initCas(cas, this.currentResource);

        final Object configuredLanguage = getConfigParameterValue("language");
        if (configuredLanguage != null) {
            cas.setDocumentLanguage((String) configuredLanguage);
        }

        // Configured after the language has been set on the CAS.
        try {
            this.posMappingProvider.configure(cas);
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }

        try {
            final JCas jcas = cas.getJCas();
            final Handler handler = newSaxHandler();
            handler.setJCas(jcas);
            handler.setLogger(getLogger());

            final SAXWriter writer = new SAXWriter(handler);
            writer.write(this.currentTeiElement);
            // Called explicitly so that the handler stores the collected document text.
            handler.endDocument();
        } catch (CASException e) {
            throw new CollectionException(e);
        } catch (SAXException e) {
            throw new IOException(e);
        }

        nextTeiElement();
    }

    /**
     * Creates the SAX handler that {@code getNext} uses to convert one TEI element. Being
     * protected and non-final, it can be overridden to supply a different handler.
     *
     * @return a new {@link TeiHandler}
     */
    protected Handler newSaxHandler() {
        return new TeiHandler();
    }
}
