package de.tudarmstadt.ukp.dkpro.core.io.penntree;

import com.clearnlp.dependency.srl.SRLLib;
import de.tudarmstadt.ukp.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
import de.tudarmstadt.ukp.dkpro.core.api.io.ResourceCollectionReaderBase;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProviderFactory;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.internal.EnhancedClassFile;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

/**
 * Collection reader for files in the Penn Treebank "chunked" format.
 *
 * <p>Each non-trash input line is a whitespace-separated sequence of {@code word/TAG} pairs. A
 * line that starts with {@code [} and ends with {@code ]} is treated as one chunk covering the
 * tokens on that line. Lines starting with {@code =========} or {@code *x*} and empty lines are
 * ignored. The document text is rebuilt by joining all tokens with single spaces; a sentence
 * ends at every token whose tag is {@code "."}.
 */
@TypeCapability(outputs = {"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.chunk.Chunk"})
@ResourceMetaData(name = "de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreebankChunkedReader", description = "Penn Treebank chunked format reader.", version = "1.8.0", vendor = "DKPro Core Project", copyright = "Copyright 2010-2015\n                            Ubiquitous Knowledge Processing (UKP) Lab\n                            Technische Universität Darmstadt")
@EnhancedClassFile
public class PennTreebankChunkedReader extends JCasResourceCollectionReader_ImplBase {
    /** Name of the parameter holding the location of the POS tag mapping file. */
    public static final String PARAM_POS_MAPPING_LOCATION = "POSMappingLocation";

    @ConfigurationParameter(name = "POSMappingLocation", mandatory = false, description = "Location of the mapping file for part-of-speech tags to UIMA types.")
    protected String posMappingLocation;

    /** Name of the parameter controlling whether token annotations are created. */
    public static final String PARAM_READ_TOKEN = "readToken";

    @ConfigurationParameter(name = "readToken", mandatory = true, defaultValue = {"true"}, description = "Write token annotations to the CAS.")
    private boolean readToken;

    /** Name of the parameter controlling whether POS annotations are created. */
    public static final String PARAM_READ_POS = "readPOS";

    @ConfigurationParameter(name = "readPOS", mandatory = true, defaultValue = {"true"}, description = "Write part-of-speech annotations to the CAS.")
    private boolean readPOS;

    /** Name of the parameter controlling whether sentence annotations are created. */
    public static final String PARAM_READ_SENTENCE = "readSentence";

    @ConfigurationParameter(name = "readSentence", mandatory = true, defaultValue = {"true"}, description = "Write sentence annotations to the CAS.")
    private boolean readSentence;

    /** Name of the parameter controlling whether chunk annotations are created. */
    public static final String PARAM_READ_CHUNK = "readChunk";

    @ConfigurationParameter(name = "readChunk", mandatory = true, defaultValue = {"true"}, description = "Write chunk annotations to the CAS.")
    private boolean readChunk;

    /** Name of the parameter overriding the POS tag set used for the tag mapping. */
    public static final String PARAM_POS_TAG_SET = "POSTagSet";

    @ConfigurationParameter(name = "POSTagSet", mandatory = false, description = "Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the\ntag set defined as part of the model meta data. This can be useful if a custom model is\nspecified which does not have such meta data, or it can be used in readers.")
    protected String posTagset;

    /** Name of the parameter holding the character encoding of the input. */
    public static final String PARAM_SOURCE_ENCODING = "sourceEncoding";

    @ConfigurationParameter(name = "sourceEncoding", mandatory = true, defaultValue = {"UTF-8"}, description = "Character encoding of the input data.")
    protected String encoding;

    public static final String ENCODING_AUTO = "auto";

    /**
     * Separator between alternative tags of a misspelled word (e.g. {@code NN^VB}).
     * NOTE(review): the decompiled source referenced ClearNLP's SRLLib.DELIM_PATH_UP here,
     * presumably a constant-folding artifact for "^" (the code that follows splits on '^').
     */
    private static final String MISSPELLED_TAG_SEPARATOR = "^";

    private MappingProvider posMappingProvider;

    /**
     * Initializes the base reader and creates the POS mapping provider from the configured
     * mapping location, tag set and language.
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        this.posMappingProvider = MappingProviderFactory.createPosMappingProvider(this.posMappingLocation, this.posTagset, getLanguage());
    }

    /**
     * Reads the next resource, rebuilds the document text from its tokens and adds the enabled
     * token, POS, sentence and chunk annotations to the CAS.
     *
     * @param jCas the CAS to fill
     * @throws IOException if the resource cannot be read or the POS mapping cannot be configured
     * @throws CollectionException declared by the overridden method
     */
    @Override
    public void getNext(JCas jCas) throws IOException, CollectionException {
        ResourceCollectionReaderBase.Resource resource = nextFile();
        initCas(jCas, resource);
        jCas.setDocumentLanguage((String) getConfigParameterValue("language"));
        try {
            this.posMappingProvider.configure(jCas.getCas());

            List<String> tokens = new ArrayList<String>();
            List<String> tags = new ArrayList<String>();
            // Each entry is {index of first token, index of last token} of one chunk.
            List<int[]> chunkSpans = new ArrayList<int[]>();

            try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource.getInputStream(), this.encoding))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    if (!lineIsTrash(line)) {
                        parseLine(line, tokens, tags, chunkSpans);
                    }
                }
            }

            jCas.setDocumentText(annotateSentenceTokenPosTypes(jCas, tokens, tags));
            if (this.readChunk) {
                annotateChunks(jCas, chunkSpans);
            }
        } catch (AnalysisEngineProcessException e) {
            throw new IOException(e);
        }
    }

    /**
     * Parses one content line, appending its tokens and tags to the given lists and, if the
     * line is bracketed, recording the token index span of the chunk.
     */
    private void parseLine(String line, List<String> tokens, List<String> tags, List<int[]> chunkSpans) {
        String content = line.trim().replaceAll("[ ]{2,}", " ");
        boolean isChunk = content.startsWith("[") && content.endsWith("]");
        int firstTokenIndex = tokens.size();
        if (isChunk) {
            content = content.replaceAll("\\[", "").replaceAll("\\]", "").trim();
        }

        for (String wordAndTag : content.split(" ")) {
            String[] parts = wordsAreConnectedByForwardSlash(wordAndTag)
                    ? splitWordsAndTagAndNormalizeEscapedSlash(wordAndTag)
                    : wordAndTag.split("/");
            if (parts == null) {
                getLogger().error("After splitting token from tag value became NULL, skipping this token");
            } else if (parts.length >= 2) {
                tokens.add(parts[0]);
                tags.add(ifWordIsMisspelledSelectTagThatFitsTheMisspelledWord(
                        selectFirstTagIfTokenIsAmbiguousInContextAndSeveralAcceptableOnesExist(parts[1])));
            }
            // Entries without a tag (fewer than two parts) are silently skipped.
        }

        if (isChunk) {
            int lastTokenIndex = tokens.size() - 1;
            // A bracketed line that produced no tokens does not form a chunk.
            if (lastTokenIndex >= firstTokenIndex) {
                chunkSpans.add(new int[] {firstTokenIndex, lastTokenIndex});
            }
        }
    }

    /**
     * Adds one {@link Chunk} per recorded span, from the begin of its first token to the end of
     * its last token. Chunk offsets are taken from the Token annotations, so nothing is added
     * when token reading is disabled.
     */
    private void annotateChunks(JCas jCas, List<int[]> list) {
        if (!this.readToken) {
            return;
        }
        List<Token> tokenAnnotations = new ArrayList<Token>(JCasUtil.select(jCas, Token.class));
        for (int[] span : list) {
            int begin = tokenAnnotations.get(span[0]).getBegin();
            int end = tokenAnnotations.get(span[1]).getEnd();
            new Chunk(jCas, begin, end).addToIndexes();
        }
    }

    /**
     * Returns the first of several {@code ^}-separated tags, or the tag unchanged if it contains
     * no {@code ^}.
     */
    private String ifWordIsMisspelledSelectTagThatFitsTheMisspelledWord(String str) {
        if (!str.contains(MISSPELLED_TAG_SEPARATOR)) {
            return str;
        }
        return str.replaceAll("\\^", " ").trim().split(" ")[0];
    }

    /** Returns true for separator lines, {@code *x*} lines and empty lines. */
    private boolean lineIsTrash(String str) {
        return str.startsWith("=========") || str.startsWith("*x*") || str.isEmpty();
    }

    /** Returns the part of the tag before the first {@code |}. */
    private String selectFirstTagIfTokenIsAmbiguousInContextAndSeveralAcceptableOnesExist(String str) {
        return str.split("\\|")[0];
    }

    /**
     * Splits at the last {@code /} into word and tag and turns every escaped slash
     * ({@code \/}) in the word into a plain {@code /}.
     *
     * @return a two-element array {word, tag}, or {@code null} if there is no {@code /}
     */
    private String[] splitWordsAndTagAndNormalizeEscapedSlash(String str) {
        int lastSlash = str.lastIndexOf("/");
        if (lastSlash < 0) {
            return null;
        }
        String word = str.substring(0, lastSlash).replaceAll("\\\\/", "/");
        String tag = str.substring(lastSlash + 1);
        return new String[] {word, tag};
    }

    /** Returns true if the entry contains an escaped slash ({@code \/}). */
    private boolean wordsAreConnectedByForwardSlash(String str) {
        return str.contains("\\/");
    }

    /**
     * Builds the document text by joining the tokens with single spaces, annotating each token
     * (and its POS) on the way and closing a sentence after every token tagged {@code "."}.
     *
     * @return the trimmed document text
     */
    private String annotateSentenceTokenPosTypes(JCas jCas, List<String> list, List<String> list2) {
        StringBuilder text = new StringBuilder();
        int sentenceBegin = 0;
        for (int i = 0; i < list.size(); i++) {
            String token = list.get(i);
            String tag = list2.get(i);
            annotateTokenWithTag(jCas, token, tag, text.length());
            text.append(token);
            text.append(" ");
            if (tag.equals(".")) {
                if (this.readSentence) {
                    // End offset excludes the separator space appended above.
                    annotateSentence(jCas, sentenceBegin, text.toString().trim().length());
                }
                sentenceBegin = text.length();
            }
        }
        // NOTE(review): tokens after the last "." tag are not covered by any Sentence.
        return text.toString().trim();
    }

    /** Adds a {@link Sentence} annotation covering the given offsets. */
    private void annotateSentence(JCas jCas, int i, int i2) {
        new Sentence(jCas, i, i2).addToIndexes();
    }

    /**
     * Adds a {@link Token} starting at offset {@code i} and, if enabled, a POS annotation of the
     * type the mapping provider returns for the tag. Does nothing when token reading is disabled.
     */
    private void annotateTokenWithTag(JCas jCas, String str, String str2, int i) {
        if (!this.readToken) {
            return;
        }
        Token token = new Token(jCas, i, i + str.length());
        token.addToIndexes();
        if (this.readPOS) {
            POS pos = (POS) jCas.getCas().createAnnotation(this.posMappingProvider.getTagType(str2), token.getBegin(), token.getEnd());
            pos.setPosValue(str2);
            pos.addToIndexes();
            token.setPos(pos);
        }
    }
}
