package de.tudarmstadt.ukp.dkpro.core.io.conll;

import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceChain;
import de.tudarmstadt.ukp.dkpro.core.api.coref.type.CoreferenceLink;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.WordSense;
import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeToJCasConverter;
import de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.factory.JCasBuilder;
import org.apache.uima.fit.internal.EnhancedClassFile;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

@TypeCapability(outputs = {"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticPredicate", "de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticArgument"})
@ResourceMetaData(name = "de.tudarmstadt.ukp.dkpro.core.io.conll.Conll2012Reader", description = "<p>Reads a file in the CoNLL-2009 format.</p>\n\n<ol>\n<li>Document ID - <b>(ignored)</b> This is a variation on the document filename.</li>\n<li>Part number - <b>(ignored)</b> Some files are divided into multiple parts numbered as 000,\n001, 002, ... etc.</li>\n<li>Word number - <b>(ignored)</b></li>\n<li>Word itself - <b>(document text)</b> This is the token as segmented/tokenized in the\nTreebank. Initially the *_skel file contain the placeholder [WORD] which gets replaced by the\nactual token from the Treebank which is part of the OntoNotes release.</li>\n<li>Part-of-Speech - <b>(POS)</b></li>\n<li>Parse bit - <b>(Constituent)</b> This is the bracketed structure broken before the first open\nparenthesis in the parse, and the word/part-of-speech leaf replaced with a *. The full parse can\nbe created by substituting the asterix with the \"([pos] [word])\" string (or leaf) and\nconcatenating the items in the rows of that column.</li>\n<li>Predicate lemma - <b>(Lemma)</b> The predicate lemma is mentioned for the rows for which we\nhave semantic role information. All other rows are marked with a \"-\"</li>\n<li>Predicate Frameset ID - <b>(SemanticPredicate)</b> This is the PropBank frameset ID of the\npredicate in Column 7.</li>\n<li>Word sense - <b>(ignored)</b> This is the word sense of the word in Column 3.</li>\n<li>Speaker/Author - <b>(ignored)</b> This is the speaker or author name where available. 
Mostly\nin Broadcast Conversation and Web Log data.</li>\n<li>Named Entities - <b>(NamedEntity)</b> These columns identifies the spans representing various\nnamed entities.</li>\n<li>Predicate Arguments - <b>(SemanticPredicate)</b> There is one column each of predicate\nargument structure information for the predicate mentioned in Column 7.</li>\n<li>Coreference - <b>(CoreferenceChain)</b> Coreference chain information encoded in a\nparenthesis structure.</li>\n</ol>\n\n<p>Sentences are separated by a blank new line.</p>", version = "1.8.0", vendor = "DKPro Core Project", copyright = "Copyright 2010-2015\n                            Ubiquitous Knowledge Processing (UKP) Lab\n                            Technische Universität Darmstadt")
@EnhancedClassFile
/* loaded from: input_file:de/tudarmstadt/ukp/dkpro/core/io/conll/Conll2012Reader.class */
public class Conll2012Reader extends JCasResourceCollectionReader_ImplBase {
    /** Name of the configuration parameter that sets {@link #encoding}. */
    public static final String PARAM_ENCODING = "sourceEncoding";

    // Character encoding handed to the InputStreamReader that getNext() opens on each resource.
    @ConfigurationParameter(name = "sourceEncoding", mandatory = true, defaultValue = {"UTF-8"})
    private String encoding;
    /** Name of the configuration parameter that sets {@link #readPos}. */
    public static final String PARAM_READ_POS = "readPOS";

    // When true, convert() configures the POS mapping provider and creates a POS annotation for
    // every token whose part-of-speech column is not "-".
    @ConfigurationParameter(name = "readPOS", mandatory = true, defaultValue = {"true"})
    private boolean readPos;
    /** Name of the configuration parameter that sets {@link #posTagset}. */
    public static final String PARAM_POS_TAG_SET = "POSTagSet";

    // Passed to MappingProviderFactory.createPosMappingProvider() in initialize().
    @ConfigurationParameter(name = "POSTagSet", mandatory = false, description = "Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the\ntag set defined as part of the model meta data. This can be useful if a custom model is\nspecified which does not have such meta data, or it can be used in readers.")
    protected String posTagset;
    /** Name of the configuration parameter that sets {@link #posMappingLocation}. */
    public static final String PARAM_POS_MAPPING_LOCATION = "POSMappingLocation";

    // Passed to MappingProviderFactory.createPosMappingProvider() in initialize().
    @ConfigurationParameter(name = "POSMappingLocation", mandatory = false, description = "Load the part-of-speech tag to UIMA type mapping from this location instead of locating\nthe mapping automatically.")
    protected String posMappingLocation;
    /** Name of the configuration parameter that sets {@link #readLemma}. */
    public static final String PARAM_READ_LEMMA = "readLemma";

    // When true, convert() creates a Lemma for every token whose lemma column is not "-".
    @ConfigurationParameter(name = "readLemma", mandatory = true, defaultValue = {"false"}, description = "Disabled by default because CoNLL 2012 format does not include lemmata for all words, only\nfor predicates.")
    private boolean readLemma;
    /** Name of the configuration parameter that sets {@link #readSemanticPredicate}. */
    public static final String PARAM_READ_SEMANTIC_PREDICATE = "readSemanticPredicate";

    // When true, convert() creates SemanticPredicate annotations and attaches the
    // SemanticArgument annotations read from the predicate-argument columns.
    @ConfigurationParameter(name = "readSemanticPredicate", mandatory = true, defaultValue = {"true"})
    private boolean readSemanticPredicate;
    /** Name of the configuration parameter that sets {@link #readWordSense}. */
    public static final String PARAM_READ_WORD_SENSE = "readWordSense";

    // When true, convert() creates a WordSense for every token whose word-sense column is not "-".
    @ConfigurationParameter(name = PARAM_READ_WORD_SENSE, mandatory = true, defaultValue = {"true"})
    private boolean readWordSense;
    /** Name of the configuration parameter that sets {@link #readConstituent}. */
    public static final String PARAM_READ_CONSTITUENT = "readConstituent";

    // When true, convert() configures the constituent mapping provider and collects the parse
    // bits of each sentence into a bracketed tree string.
    @ConfigurationParameter(name = "readConstituent", mandatory = true, defaultValue = {"true"})
    private boolean readConstituent;
    /** Name of the configuration parameter that sets {@link #readCoreference}. */
    public static final String PARAM_READ_COREFERENCE = "readCoreference";

    // When true, convert() builds CoreferenceChain/CoreferenceLink annotations from the last column.
    @ConfigurationParameter(name = "readCoreference", mandatory = true, defaultValue = {"true"})
    private boolean readCoreference;
    /** Name of the configuration parameter that sets {@link #readNamedEntity}. */
    public static final String PARAM_READ_NAMED_ENTITY = "readNamedEntity";

    // When true, convert() creates NamedEntity annotations from the named-entity column.
    @ConfigurationParameter(name = "readNamedEntity", mandatory = true, defaultValue = {"true"})
    private boolean readNamedEntity;
    /** Name of the configuration parameter that sets {@link #constituentTagset}. */
    public static final String PARAM_CONSTITUENT_TAG_SET = "ConstituentTagSet";

    // Passed to MappingProviderFactory.createConstituentMappingProvider() in initialize().
    @ConfigurationParameter(name = "ConstituentTagSet", mandatory = false, description = "Use this constituent tag set to use to resolve the tag set mapping instead of using the\ntag set defined as part of the model meta data. This can be useful if a custom model is\nspecified which does not have such meta data, or it can be used in readers.")
    protected String constituentTagset;
    /** Name of the configuration parameter that sets {@link #constituentMappingLocation}. */
    public static final String PARAM_CONSTITUENT_MAPPING_LOCATION = "ConstituentMappingLocation";

    // Passed to MappingProviderFactory.createConstituentMappingProvider() in initialize().
    @ConfigurationParameter(name = "ConstituentMappingLocation", mandatory = false, description = "Load the constituent tag to UIMA type mapping from this location instead of locating\nthe mapping automatically.")
    protected String constituentMappingLocation;
    /** Name of the configuration parameter that sets {@link #internTags}. */
    public static final String PARAM_INTERN_TAGS = "internTags";

    // Forwarded to PennTreeToJCasConverter.setInternTags() in initialize().
    @ConfigurationParameter(name = "internTags", mandatory = false, defaultValue = {"true"}, description = "Use the String#intern() method on tags. This is usually a good idea to avoid\nspaming the heap with thousands of strings representing only a few different tags.\n\nDefault: true")
    private boolean internTags;
    /** Name of the configuration parameter that sets {@link #writeTracesToText}. */
    public static final String PARAM_WRITE_TRACES_TO_TEXT = "writeTracesToText";

    // Forwarded to PennTreeToJCasConverter.setWriteTracesToText() in initialize().
    @ConfigurationParameter(name = "writeTracesToText", mandatory = false, defaultValue = {"false"})
    private boolean writeTracesToText;
    /** Name of the configuration parameter that sets {@link #useHeaderMetadata}. */
    public static final String PARAM_USE_HEADER_METADATA = "useHeaderMetadata";

    // When true, readSentence() overwrites the document ID with the one found in a
    // "#begin document (...); part NNN" header line.
    @ConfigurationParameter(name = PARAM_USE_HEADER_METADATA, mandatory = true, defaultValue = {"true"}, description = "Use the document ID declared in the file header instead of using the filename.")
    private boolean useHeaderMetadata;
    // Column layout of a CoNLL-2012 row after splitting on whitespace. The values mirror the
    // literals that the method bodies in this file use when indexing into a row.

    // Cell value marking a column as empty for the current token.
    private static final String UNUSED = "-";
    // Word number; convert() uses it as the key under which the token is remembered.
    private static final int ID = 2;
    // Word form; becomes the token text.
    private static final int FORM = 3;
    // Part-of-speech tag. Shares its simple name with the imported POS annotation type.
    private static final int POS = 4;
    // Parse bit; "*" stands for the "(pos word)" leaf of the current token.
    private static final int PARSE = 5;
    // Predicate lemma.
    private static final int LEMMA = 6;
    // Predicate frameset ID; stored as the SemanticPredicate category.
    private static final int PRED = 7;
    // Word sense.
    private static final int WORD_SENSE = 8;
    // Named-entity bracket column.
    private static final int NAMED_ENTITIES = 10;
    // First predicate-argument column; convert() reads column 11 + n for the n-th predicate.
    private static final int APRED = 11;
    // Created in initialize() from posMappingLocation / posTagset and the reader language.
    private MappingProvider posMappingProvider;
    // Created in initialize() from constituentMappingLocation / constituentTagset and the reader language.
    private MappingProvider constituentMappingProvider;
    // Created in initialize(); turns the reconstructed Penn tree of a sentence into annotations.
    private PennTreeToJCasConverter converter;

    /**
     * Sets up the tag mapping providers and the Penn tree converter after the base class has
     * processed the configuration parameters.
     *
     * @param uimaContext the UIMA context handed to the base class initialisation
     * @throws ResourceInitializationException if the base class initialisation fails
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);

        posMappingProvider = MappingProviderFactory.createPosMappingProvider(
                posMappingLocation, posTagset, getLanguage());
        constituentMappingProvider = MappingProviderFactory.createConstituentMappingProvider(
                constituentMappingLocation, constituentTagset, getLanguage());

        PennTreeToJCasConverter treeConverter =
                new PennTreeToJCasConverter(posMappingProvider, constituentMappingProvider);
        treeConverter.setInternTags(internTags);
        treeConverter.setWriteTracesToText(writeTracesToText);
        // POS annotations are created by convert() itself, not by the tree converter.
        treeConverter.setCreatePosTags(false);
        treeConverter.setRootLabel("TOP");
        converter = treeConverter;
    }

    /**
     * Loads the next resource into the given CAS: initialises the CAS from the resource, then
     * parses the resource content using the configured source encoding.
     *
     * @param jCas the CAS to fill
     * @throws IOException if the resource cannot be read or converted
     * @throws CollectionException declared by the reader contract
     */
    @Override
    public void getNext(JCas jCas) throws IOException, CollectionException {
        ResourceCollectionReaderBase.Resource resource = nextFile();
        initCas(jCas, resource);

        BufferedReader input = null;
        try {
            input = new BufferedReader(new InputStreamReader(resource.getInputStream(), this.encoding));
            convert(jCas, input);
        } finally {
            // Closed on success and on failure alike; errors while closing are ignored.
            IOUtils.closeQuietly((Reader) input);
        }
    }

    /**
     * Reads CoNLL-2012 sentences from the given reader and adds the document text together with
     * the enabled annotation layers to the CAS.
     *
     * <p>Tokens are appended to the document text separated by a single space and every sentence
     * is terminated by a line break. Depending on the {@code read*} flags, lemmas, POS tags,
     * semantic predicates with their arguments, word senses, named entities, coreference chains
     * and constituents are created. The constituent tree is only built when
     * {@code readConstituent} is enabled and the sentence actually carries parse bits.</p>
     *
     * @param jCas the CAS to fill
     * @param bufferedReader the source of the CoNLL-2012 formatted data
     * @throws IOException if reading fails, if the tag mapping cannot be configured, or if a
     *         bracket column closes a span that was never opened
     */
    public void convert(JCas jCas, BufferedReader bufferedReader) throws IOException {
        try {
            if (this.readPos) {
                this.posMappingProvider.configure(jCas.getCas());
            }
            if (this.readConstituent) {
                this.constituentMappingProvider.configure(jCas.getCas());
            }

            // Most recent link of each coreference chain, keyed by chain ID. Kept across
            // sentences because a chain usually spans several of them.
            HashMap<String, CoreferenceLink> lastLinkByChain = new HashMap<>();
            JCasBuilder doc = new JCasBuilder(jCas);

            List<String[]> words;
            while ((words = readSentence(jCas, bufferedReader)) != null) {
                if (words.isEmpty()) {
                    continue;
                }

                int sentenceBegin = doc.getPosition();
                int sentenceEnd = sentenceBegin;
                StringBuilder parse = new StringBuilder();
                HashMap<Integer, Token> tokensById = new HashMap<>();
                List<SemanticPredicate> predicates = new ArrayList<>();

                for (String[] word : words) {
                    // Column 3: word form, column 2: word number.
                    Token token = (Token) doc.add(word[3], Token.class);
                    tokensById.put(Integer.valueOf(word[2]), token);
                    doc.add(" ");

                    // Column 6: predicate lemma.
                    if (this.readLemma && !"-".equals(word[6])) {
                        Lemma lemma = new Lemma(jCas, token.getBegin(), token.getEnd());
                        lemma.setValue(word[6]);
                        lemma.addToIndexes();
                        token.setLemma(lemma);
                    }

                    // Column 4: part-of-speech tag.
                    if (this.readPos && !"-".equals(word[4])) {
                        POS pos = (POS) jCas.getCas().createAnnotation(
                                this.posMappingProvider.getTagType(word[4]),
                                token.getBegin(), token.getEnd());
                        pos.setPosValue(word[4]);
                        pos.addToIndexes();
                        token.setPos(pos);
                    }

                    // Column 7: predicate frameset ID.
                    if (this.readSemanticPredicate && !"-".equals(word[7])) {
                        SemanticPredicate predicate =
                                new SemanticPredicate(jCas, token.getBegin(), token.getEnd());
                        predicate.setCategory(word[7]);
                        predicate.addToIndexes();
                        predicates.add(predicate);
                    }

                    // Column 5: parse bit; the "*" placeholder is replaced by the token leaf.
                    if (this.readConstituent && !"-".equals(word[5])) {
                        parse.append(word[5].replace("*", "(" + word[4] + " " + word[3] + ")"));
                    }

                    // Column 8: word sense.
                    if (this.readWordSense && !"-".equals(word[8])) {
                        WordSense sense = new WordSense(jCas, token.getBegin(), token.getEnd());
                        sense.setValue(word[8]);
                        sense.addToIndexes();
                    }

                    // Last column: coreference, e.g. "(12", "12)", "(12)" or "(12|(13".
                    String corefCell = word[word.length - 1];
                    if (this.readCoreference && !"-".equals(corefCell)) {
                        for (String mention : corefCell.split("\\|")) {
                            boolean opens = mention.startsWith("(");
                            boolean closes = mention.endsWith(")");
                            String chainId = mention.substring(opens ? 1 : 0,
                                    closes ? mention.length() - 1 : mention.length());

                            CoreferenceLink link = lastLinkByChain.get(chainId);
                            if (opens) {
                                if (link == null) {
                                    // First mention of this chain: create the chain itself.
                                    link = new CoreferenceLink(jCas);
                                    CoreferenceChain chain = new CoreferenceChain(jCas);
                                    chain.setFirst(link);
                                    chain.addToIndexes();
                                } else {
                                    // Later mention: append to the previous link of the chain.
                                    CoreferenceLink next = new CoreferenceLink(jCas);
                                    link.setNext(next);
                                    link = next;
                                }
                                link.setReferenceType(chainId);
                                link.setBegin(token.getBegin());
                            }
                            if (closes) {
                                if (link == null) {
                                    throw new IOException("Coreference mention [" + mention
                                            + "] closes chain [" + chainId
                                            + "] which has not been opened");
                                }
                                link.setEnd(token.getEnd());
                                link.addToIndexes();
                            }
                            lastLinkByChain.put(chainId, link);
                        }
                    }

                    sentenceEnd = token.getEnd();
                }

                // Column 10: named entities, e.g. "(ORG)" or "(ORG*" ... "*)".
                if (this.readNamedEntity) {
                    int entityStart = -1;
                    String entityLabel = null;
                    for (int i = 0; i < words.size(); i++) {
                        String cell = words.get(i)[10];
                        if (cell.startsWith("(")) {
                            entityLabel = extractBracketLabel(cell);
                            entityStart = i;
                        }
                        if (cell.endsWith(")")) {
                            if (entityStart < 0) {
                                throw new IOException("Named entity column value [" + cell
                                        + "] closes a span which has not been opened");
                            }
                            NamedEntity entity = new NamedEntity(jCas,
                                    tokensById.get(Integer.valueOf(entityStart)).getBegin(),
                                    tokensById.get(Integer.valueOf(i)).getEnd());
                            entity.setValue(entityLabel);
                            entity.addToIndexes();
                            entityStart = -1;
                            entityLabel = null;
                        }
                    }
                }

                // Columns 11..: one argument column per predicate, in predicate order.
                if (this.readSemanticPredicate) {
                    for (int p = 0; p < predicates.size(); p++) {
                        SemanticPredicate predicate = predicates.get(p);
                        List<SemanticArgument> arguments = new ArrayList<>();
                        int argStart = -1;
                        String argRole = null;
                        for (int i = 0; i < words.size(); i++) {
                            String cell = words.get(i)[11 + p];
                            if (cell.startsWith("(")) {
                                // Single-token arguments look like "(ARG0*)"; the helper drops
                                // the "*" so that the role is "ARG0" and not "ARG0*".
                                argRole = extractBracketLabel(cell);
                                argStart = i;
                            }
                            if (cell.endsWith(")")) {
                                if (argStart < 0) {
                                    throw new IOException("Predicate argument column value ["
                                            + cell + "] closes a span which has not been opened");
                                }
                                int begin = tokensById.get(Integer.valueOf(argStart)).getBegin();
                                int end = tokensById.get(Integer.valueOf(i)).getEnd();
                                // The span of the predicate itself (the "(V*)" cell) is skipped.
                                if (predicate.getBegin() != begin || predicate.getEnd() != end) {
                                    SemanticArgument argument = new SemanticArgument(jCas, begin, end);
                                    argument.setRole(argRole);
                                    argument.addToIndexes();
                                    arguments.add(argument);
                                }
                                argStart = -1;
                                argRole = null;
                            }
                        }
                        predicate.setArguments(FSCollectionFactory.createFSArray(jCas, arguments));
                    }
                }

                Sentence sentence = new Sentence(jCas, sentenceBegin, sentenceEnd);
                sentence.addToIndexes();

                // Only build the tree when constituents were requested and collected. Otherwise
                // the parse buffer is empty and the constituent mapping provider unconfigured.
                if (this.readConstituent && parse.length() > 0) {
                    this.converter.convertPennTree(sentence, PennTreeUtils.parsePennTree(parse.toString()));
                }

                doc.add("\n");
            }
            doc.close();
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }

    /**
     * Extracts the label from a bracket-column cell that opens a span, i.e. the characters
     * between the leading "(" and the first "*" or ")" (or the end of the cell).
     *
     * @param cell a cell value starting with "(", for example "(ORG)", "(ORG*" or "(ARG0*)"
     * @return the label without bracket and asterisk decoration
     */
    private static String extractBracketLabel(String cell) {
        int end = 1;
        while (end < cell.length() && cell.charAt(end) != '*' && cell.charAt(end) != ')') {
            end++;
        }
        return cell.substring(Math.min(1, cell.length()), end);
    }

    /**
     * Reads the rows of the next sentence from the reader.
     *
     * <p>Reading stops at a blank line, at a line starting with "&lt;", or at the end of the
     * input. Lines starting with "#" are not returned as rows; a "#begin document" header sets
     * the document ID in the CAS metadata when header metadata is enabled.</p>
     *
     * @param jCas the CAS whose document metadata may be updated from a header line
     * @param bufferedReader the source to read lines from
     * @return the whitespace-split rows of the sentence (possibly empty), or {@code null} when
     *         the end of the input was reached without reading any row
     * @throws IOException if reading from the source fails
     */
    private List<String[]> readSentence(JCas jCas, BufferedReader bufferedReader) throws IOException {
        List<String[]> rows = new ArrayList<String[]>();

        String line = bufferedReader.readLine();
        while (line != null && !StringUtils.isBlank(line)) {
            if (line.startsWith("#")) {
                // Comment/header line: only "#begin document (...); part NNN" carries information.
                if (this.useHeaderMetadata && line.startsWith("#begin")) {
                    Matcher header = Pattern.compile("^#begin document \\((.*)\\); part (\\d+)$").matcher(line);
                    if (header.matches()) {
                        DocumentMetaData.get(jCas).setDocumentId(header.group(1) + '#' + header.group(2));
                    }
                }
            } else if (line.startsWith("<")) {
                // Markup line: ends the current sentence just like a blank line does.
                break;
            } else {
                rows.add(line.split("\\s+"));
            }
            line = bufferedReader.readLine();
        }

        boolean endOfInput = line == null;
        if (endOfInput && rows.isEmpty()) {
            return null;
        }
        return rows;
    }
}
