package de.tudarmstadt.ukp.dkpro.core.tokit;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.SegmenterBase;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.internal.EnhancedClassFile;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

/**
 * Segmenter that creates {@link Sentence} and {@link Token} annotations by splitting the input
 * text at matches of two configurable regular expressions.
 *
 * <p>Each annotation spans the text between the end of the previous boundary match (or the start
 * of the text) and the start of the next boundary match. Text following the last boundary match
 * is not annotated; to make sure the final segment is usually closed, a line break is appended to
 * the text before matching if it does not already end with one.
 */
@ResourceMetaData(name = "de.tudarmstadt.ukp.dkpro.core.tokit.RegexTokenizer", description = "This segmenter splits sentences and tokens based on regular expressions that define the sentence\nand token boundaries.\n<p>\nThe default behaviour is to split sentences by a line break and tokens by whitespace.", version = "1.8.0", vendor = "DKPro Core Project", copyright = "Copyright 2010-2015\n                            Ubiquitous Knowledge Processing (UKP) Lab\n                            Technische Universität Darmstadt")
@EnhancedClassFile
public class RegexTokenizer extends SegmenterBase {
    /** Default sentence boundary: a single line break. */
    private static final String LINEBREAK_PATTERN = "\n";

    /** Default token boundary: one or more whitespace characters (including line breaks). */
    private static final String WHITESPACE_PATTERN = "[\\s\n]+";

    /** Name of the configuration parameter holding the token boundary regex. */
    public static final String PARAM_TOKEN_BOUNDARY_REGEX = "tokenBoundaryRegex";

    @ConfigurationParameter(name = PARAM_TOKEN_BOUNDARY_REGEX, mandatory = true, defaultValue = {WHITESPACE_PATTERN}, description = "Defines the pattern that is used as token end boundary. Default: [\\s\\n]+ (matching\nwhitespace and linebreaks.\n<p>\nWhen setting custom patterns, take into account that the final token is often terminated by a\nlinebreak rather than the boundary character. Therefore, the newline typically has to be\nadded to the group of matching characters, e.g. \"tokenized-text\" is correctly\ntokenized with the pattern [-\\n].")
    private String tokenBoundaryRegex;

    /** Compiled form of {@link #tokenBoundaryRegex}; set in {@link #initialize(UimaContext)}. */
    private Pattern tokenBoundaryPattern;

    /** Name of the configuration parameter holding the sentence boundary regex. */
    public static final String PARAM_SENTENCE_BOUNDARY_REGEX = "sentenceBoundaryRegex";

    @ConfigurationParameter(name = PARAM_SENTENCE_BOUNDARY_REGEX, mandatory = true, defaultValue = {LINEBREAK_PATTERN}, description = "Define the sentence boundary. Default: \\n (assume one sentence per line).")
    private String sentenceBoundaryRegex;

    /** Compiled form of {@link #sentenceBoundaryRegex}; set in {@link #initialize(UimaContext)}. */
    private Pattern sentenceBoundaryPattern;

    /**
     * Initializes the component and compiles both boundary expressions once, so that they are not
     * recompiled for every processed text.
     *
     * @param uimaContext the UIMA context passed on to the superclass
     * @throws ResourceInitializationException if the superclass initialization fails
     */
    @Override
    public void initialize(UimaContext uimaContext) throws ResourceInitializationException {
        super.initialize(uimaContext);
        this.tokenBoundaryPattern = Pattern.compile(this.tokenBoundaryRegex);
        this.sentenceBoundaryPattern = Pattern.compile(this.sentenceBoundaryRegex);
    }

    /**
     * Segments one piece of text into sentences (if enabled) and tokens.
     *
     * @param aJCas the CAS that receives the annotations
     * @param text the text to segment
     * @param zoneBegin offset of {@code text} within the document text; it is added to every
     *        annotation so that offsets stay document-relative when only a zone of the document
     *        is passed in
     * @throws AnalysisEngineProcessException declared by the overridden method
     */
    @Override
    protected void process(JCas aJCas, String text, int zoneBegin) throws AnalysisEngineProcessException {
        // Guarantee a trailing line break so that the last segment is terminated by the default
        // patterns (and by custom patterns that include the line break).
        String terminated = text.endsWith("\n") ? text : text + "\n";

        if (isWriteSentence()) {
            createSentences(aJCas, terminated, zoneBegin);
        }
        // NOTE(review): tokens are written unconditionally while sentences honour
        // isWriteSentence(); confirm whether the base class offers a matching token switch.
        createTokens(aJCas, terminated, zoneBegin);
    }

    /**
     * Adds a {@link Sentence} for every stretch of text that ends at a sentence boundary match.
     *
     * @param aJCas the CAS that receives the annotations
     * @param text the (newline-terminated) text to split
     * @param offset value added to the text-relative begin/end of each annotation
     */
    private void createSentences(JCas aJCas, String text, int offset) {
        Matcher boundary = this.sentenceBoundaryPattern.matcher(text);
        int segmentStart = 0;
        while (boundary.find()) {
            new Sentence(aJCas, offset + segmentStart, offset + boundary.start()).addToIndexes(aJCas);
            // The next segment begins right after the boundary itself.
            segmentStart = boundary.end();
        }
    }

    /**
     * Adds a {@link Token} for every stretch of text that ends at a token boundary match.
     *
     * @param aJCas the CAS that receives the annotations
     * @param text the (newline-terminated) text to split
     * @param offset value added to the text-relative begin/end of each annotation
     */
    private void createTokens(JCas aJCas, String text, int offset) {
        Matcher boundary = this.tokenBoundaryPattern.matcher(text);
        int segmentStart = 0;
        while (boundary.find()) {
            new Token(aJCas, offset + segmentStart, offset + boundary.start()).addToIndexes(aJCas);
            // The next segment begins right after the boundary itself.
            segmentStart = boundary.end();
        }
    }
}
