/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.process;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.objectbank.XMLBeginEndIterator;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Properties;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Pattern;

/**
 * Splits a character stream into sentences, each represented as a list of
 * {@link HasWord} tokens.
 *
 * <p>The input is either treated as plain text or as XML from which the text of
 * selected elements is extracted first (see {@link DocType}). Sentences are
 * obtained by iterating over this object; the iterators consume the underlying
 * reader and close it once the input is exhausted, so an instance is intended
 * to be iterated a single time.
 */
public class DocumentPreprocessor
implements Iterable<List<HasWord>> {
    /** Tokens that end a sentence unless a different set is supplied via {@code setSentenceFinalPuncWords}. */
    public static final String[] DEFAULT_SENTENCE_DELIMS = new String[]{".", "?", "!", "!!", "!!!", "??", "?!", "!?"};
    /**
     * Source of characters. Not final: the XML iterator points it at each extracted
     * element in turn, and the plain-text iterator closes it and sets it to
     * {@code null} once it runs out of tokens.
     */
    private Reader inputReader;
    /** Selects which iterator implementation {@code iterator()} returns. */
    private final DocType docType;
    /** Factory for the tokenizer; when {@code null} a whitespace tokenizer is used instead. */
    private TokenizerFactory<? extends HasWord> tokenizerFactory = PTBTokenizer.coreLabelFactory();
    /** Sentence-final tokens, consulted only while {@code sentenceDelimiter} is {@code null}. */
    private String[] sentenceFinalPuncWords = DEFAULT_SENTENCE_DELIMS;
    /** Optional transformation applied to a sentence before it is handed out; {@code null} means none. */
    private Function<List<HasWord>, List<HasWord>> escaper;
    /** When non-null, the single token that ends a sentence (replaces the punctuation set). */
    private String sentenceDelimiter;
    /** When non-null, tokens are split at the last occurrence of this delimiter into word and tag. */
    private String tagDelimiter;
    /** Passed to {@code XMLBeginEndIterator} to select the XML elements to read; the default matches any name. */
    private String elementDelimiter = ".*";
    /** Matches strings that consist entirely of whitespace (used with {@code matches()}). */
    private static final Pattern wsPattern = Pattern.compile("\\s+");
    /** Tokens that remain part of the current sentence when they directly follow a sentence-final token. */
    private final String[] sentenceFinalFollowers = new String[]{")", "]", "\"", "'", "''", "-RRB-", "-RSB-", "-RCB-"};
    /** Whether the plain-text iterator may return sentences that contain no tokens. */
    private boolean keepEmptySentences;

    /**
     * Creates a preprocessor that reads plain text ({@code DocType.Plain}) from the given reader.
     *
     * @param input the reader to consume
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public DocumentPreprocessor(Reader input) {
        this(input, DocType.Plain);
    }

    /**
     * Creates a preprocessor that reads a document of the given type from a reader.
     *
     * @param input the reader to consume
     * @param t how the input is to be interpreted; not validated here, an unsupported
     *          value is only reported when {@code iterator()} is called
     * @throws IllegalArgumentException if {@code input} is {@code null}
     */
    public DocumentPreprocessor(Reader input, DocType t) {
        if (input == null) {
            throw new IllegalArgumentException("Cannot read from null object!");
        }
        this.docType = t;
        this.inputReader = input;
    }

    /**
     * Creates a preprocessor for a plain-text document at the given path, read as UTF-8.
     *
     * @param docPath location of the document
     * @throws IllegalArgumentException if {@code docPath} is {@code null}
     */
    public DocumentPreprocessor(String docPath) {
        this(docPath, DocType.Plain, "UTF-8");
    }

    /**
     * Creates a preprocessor for a document of the given type at the given path, read as UTF-8.
     *
     * @param docPath location of the document
     * @param t how the input is to be interpreted
     * @throws IllegalArgumentException if {@code docPath} is {@code null}
     */
    public DocumentPreprocessor(String docPath, DocType t) {
        this(docPath, t, "UTF-8");
    }

    public DocumentPreprocessor(String docPath, DocType t, String encoding) {
        if (docPath == null) {
            throw new IllegalArgumentException("Cannot open null document path!");
        }
        this.docType = t;
        try {
            this.inputReader = IOUtils.readerFromString(docPath, encoding);
        }
        catch (IOException ioe) {
            throw new RuntimeIOException(String.format("%s: Could not open path %s", this.getClass().getName(), docPath), ioe);
        }
    }

    /**
     * Controls whether the plain-text iterator may return sentences without any tokens.
     *
     * @param keepEmptySentences {@code true} to keep empty sentences, {@code false} to skip them
     */
    public void setKeepEmptySentences(boolean keepEmptySentences) {
        this.keepEmptySentences = keepEmptySentences;
    }

    /**
     * Replaces the set of tokens that end a sentence. The set is only consulted
     * while no explicit sentence delimiter has been set; {@code null} results in
     * no punctuation-based delimiters at all.
     *
     * @param sentenceFinalPuncWords the sentence-final tokens (the array is stored, not copied)
     */
    public void setSentenceFinalPuncWords(String[] sentenceFinalPuncWords) {
        this.sentenceFinalPuncWords = sentenceFinalPuncWords;
    }

    /**
     * Sets the factory used to create the tokenizer for each iterator.
     *
     * @param newTokenizerFactory the factory to use, or {@code null} to fall back
     *                            to whitespace tokenization
     */
    public void setTokenizerFactory(TokenizerFactory<? extends HasWord> newTokenizerFactory) {
        this.tokenizerFactory = newTokenizerFactory;
    }

    /**
     * Sets a function that the plain-text iterator applies to a sentence before
     * returning it.
     *
     * @param e the transformation, or {@code null} for none
     */
    public void setEscaper(Function<List<HasWord>, List<HasWord>> e) {
        this.escaper = e;
    }

    /**
     * Sets a single token that ends a sentence. When non-null it replaces the
     * sentence-final punctuation set, and no follower tokens are attached to the
     * preceding sentence. If the delimiter consists only of whitespace, newline
     * tokens are treated as sentence ends as well.
     *
     * @param s the delimiter token, or {@code null} to split on sentence-final punctuation
     */
    public void setSentenceDelimiter(String s) {
        this.sentenceDelimiter = s;
    }

    /**
     * Enables splitting of tagged tokens. Each token is split at the last
     * occurrence of the delimiter; the first part becomes the word and, for
     * tokens implementing {@code HasTag}, the second part becomes the tag.
     *
     * <p>The delimiter is inserted verbatim into a regular expression, so
     * characters with a special meaning in regular expressions are interpreted
     * as such.
     *
     * @param s the delimiter, or {@code null} to disable tag splitting
     */
    public void setTagDelimiter(String s) {
        this.tagDelimiter = s;
    }

    /**
     * Sets the element selector handed to {@code XMLBeginEndIterator} when the
     * document type is XML. The default is {@code ".*"}.
     *
     * @param s the element selector (presumably a regular expression over element
     *          names; confirm against {@code XMLBeginEndIterator})
     */
    public void setElementDelimiter(String s) {
        this.elementDelimiter = s;
    }

    /**
     * Returns an iterator over the sentences of the document, chosen according to
     * the document type given at construction.
     *
     * @return a sentence iterator for the configured document type
     * @throws IllegalStateException if the document type is neither plain text nor XML
     */
    @Override
    public Iterator<List<HasWord>> iterator() {
        final DocType type = this.docType;
        if (type == DocType.XML) {
            return new XMLIterator();
        }
        if (type != DocType.Plain) {
            // Reached for a null type or for an enum constant without a handler.
            throw new IllegalStateException("Someone didn't add a handler for a new docType.");
        }
        return new PlainTextIterator();
    }

    /**
     * Builds the command-line help text printed by {@code main} for {@code -help}.
     *
     * @return the usage message, with lines terminated by the platform line separator
     */
    private static String usage() {
        StringBuilder sb = new StringBuilder();
        String nl = System.lineSeparator();
        sb.append(String.format("Usage: java %s [OPTIONS] [file] [< file]%n%n", DocumentPreprocessor.class.getName()));
        sb.append("Options:").append(nl);
        // -help is handled by main and therefore listed here as well.
        sb.append("-help                   : Print this message.").append(nl);
        sb.append("-xml delim              : XML input with associated delimiter.").append(nl);
        sb.append("-encoding type          : Input encoding (default: UTF-8).").append(nl);
        sb.append("-printSentenceLengths   : Print the length of each sentence to stderr.").append(nl);
        sb.append("-noTokenization         : Split on newline delimiters only.").append(nl);
        sb.append("-printOriginalText      : Print the original, not normalized form of tokens.").append(nl);
        sb.append("-suppressEscaping       : Suppress PTB escaping.").append(nl);
        sb.append("-tokenizerOptions opts  : Specify custom tokenizer options.").append(nl);
        sb.append("-tag delim              : Input tokens are tagged. Split tags.").append(nl);
        sb.append("-whitespaceTokenization : Whitespace tokenization only.").append(nl);
        return sb.toString();
    }

    /**
     * Declares, for every command-line flag understood by {@code main}, how many
     * arguments the flag takes. The map is handed to
     * {@code StringUtils.argsToProperties}.
     *
     * @return a new map from flag name (without leading dash) to argument count
     */
    private static Map<String, Integer> argOptionDefs() {
        Map<String, Integer> argOptionDefs = Generics.newHashMap();
        argOptionDefs.put("help", 0);
        argOptionDefs.put("xml", 1);
        argOptionDefs.put("encoding", 1);
        argOptionDefs.put("printSentenceLengths", 0);
        argOptionDefs.put("noTokenization", 0);
        // Registered as a zero-argument switch like the other tokenizer flags, so
        // that the argument parser cannot take a following file name as its value.
        argOptionDefs.put("printOriginalText", 0);
        argOptionDefs.put("suppressEscaping", 0);
        argOptionDefs.put("tag", 1);
        argOptionDefs.put("tokenizerOptions", 1);
        argOptionDefs.put("whitespaceTokenization", 0);
        return argOptionDefs;
    }

    /**
     * Command-line entry point: splits the given files (or standard input when no
     * file is given) into sentences and prints one sentence per line to standard
     * output. Diagnostics and the final sentence count go to standard error.
     *
     * @param args command-line flags followed by optional file names
     * @throws IOException if the requested encoding is not supported
     */
    public static void main(String[] args) throws IOException {
        Properties options = StringUtils.argsToProperties(args, DocumentPreprocessor.argOptionDefs());
        if (options.containsKey("help")) {
            System.err.println(DocumentPreprocessor.usage());
            return;
        }

        String encoding = options.getProperty("encoding", "utf-8");
        boolean printSentenceLengths = PropertiesUtils.getBool(options, "printSentenceLengths", false);
        String xmlElementDelimiter = options.getProperty("xml", null);
        DocType docType = xmlElementDelimiter == null ? DocType.Plain : DocType.XML;
        String sentenceDelimiter = options.containsKey("noTokenization") ? System.getProperty("line.separator") : null;
        String tagDelimiter = options.getProperty("tag", null);
        String[] sentenceDelims = null;

        // The four tokenizer-related switches are mutually exclusive.
        boolean suppressEscaping = options.containsKey("suppressEscaping");
        boolean customTokenizer = options.containsKey("tokenizerOptions");
        boolean printOriginalText = options.containsKey("printOriginalText");
        boolean whitespaceTokenization = options.containsKey("whitespaceTokenization");
        int numFactoryFlags = 0;
        for (boolean flag : new boolean[]{suppressEscaping, customTokenizer, printOriginalText, whitespaceTokenization}) {
            if (flag) {
                ++numFactoryFlags;
            }
        }
        if (numFactoryFlags > 1) {
            System.err.println("Only one tokenizer flag allowed at a time: ");
            System.err.println("  -suppressEscaping, -tokenizerOptions, -printOriginalText, -whitespaceTokenization");
            return;
        }

        // A null factory makes the preprocessor fall back to whitespace tokenization.
        TokenizerFactory<CoreLabel> tf = null;
        if (suppressEscaping) {
            tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false");
        } else if (customTokenizer) {
            tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions"));
        } else if (printOriginalText) {
            tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
        } else if (whitespaceTokenization) {
            // Newlines end sentences in addition to the default punctuation.
            List<String> whitespaceDelims = new ArrayList<String>(Arrays.asList(DEFAULT_SENTENCE_DELIMS));
            whitespaceDelims.add("\n");
            sentenceDelims = whitespaceDelims.toArray(new String[0]);
        } else {
            tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
        }

        // Arguments that are not flags are stored under the empty key; a single
        // null entry stands for "read standard input".
        String fileList = options.getProperty("", null);
        String[] files = fileList == null ? new String[1] : fileList.split("\\s+");

        int numSents = 0;
        try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, encoding), true)) {
            for (String file : files) {
                DocumentPreprocessor docPreprocessor;
                if (file == null || file.isEmpty()) {
                    // Pass the document type along so that -xml is also honoured
                    // when the input comes from standard input.
                    docPreprocessor = new DocumentPreprocessor(new InputStreamReader(System.in, encoding), docType);
                } else {
                    docPreprocessor = new DocumentPreprocessor(file, docType, encoding);
                }
                if (docType == DocType.XML) {
                    docPreprocessor.setElementDelimiter(xmlElementDelimiter);
                }
                docPreprocessor.setTokenizerFactory(tf);
                if (sentenceDelimiter != null) {
                    docPreprocessor.setSentenceDelimiter(sentenceDelimiter);
                }
                if (tagDelimiter != null) {
                    docPreprocessor.setTagDelimiter(tagDelimiter);
                }
                if (sentenceDelims != null) {
                    docPreprocessor.setSentenceFinalPuncWords(sentenceDelims);
                }

                for (List<HasWord> sentence : docPreprocessor) {
                    ++numSents;
                    if (printSentenceLengths) {
                        System.err.printf("Length: %d%n", sentence.size());
                    }
                    boolean printSpace = false;
                    for (HasWord word : sentence) {
                        if (printOriginalText) {
                            // Reproduce the source text: leading whitespace once per
                            // sentence, then each token followed by its trailing text.
                            CoreLabel cl = (CoreLabel) word;
                            if (!printSpace) {
                                pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class));
                                printSpace = true;
                            }
                            pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
                            pw.print(cl.get(CoreAnnotations.AfterAnnotation.class));
                        } else {
                            if (printSpace) {
                                pw.print(" ");
                            }
                            printSpace = true;
                            pw.print(word.word());
                        }
                    }
                    pw.println();
                }
            }
        }
        System.err.printf("Read in %d sentences.%n", numSents);
    }

    /**
     * Sentence iterator for XML input. Text blocks are pulled from the document
     * with an {@code XMLBeginEndIterator}; each block is then split into sentences
     * by a fresh {@code PlainTextIterator}.
     *
     * <p>To reuse the plain-text logic, the enclosing instance's
     * {@code inputReader} is repeatedly replaced by a reader over the current
     * block. The reader of the original document is remembered separately and
     * closed once no further blocks are available.
     */
    private class XMLIterator
    implements Iterator<List<HasWord>> {
        private final XMLBeginEndIterator<String> xmlItr;
        private final Reader originalDocReader;
        private PlainTextIterator plainItr;
        private List<HasWord> nextSent;

        /** Starts reading from the enclosing instance's current reader and looks up the first sentence. */
        public XMLIterator() {
            this.xmlItr = new XMLBeginEndIterator<String>(DocumentPreprocessor.this.inputReader, DocumentPreprocessor.this.elementDelimiter);
            this.originalDocReader = DocumentPreprocessor.this.inputReader;
            this.primeNext();
        }

        /**
         * Advances to the next sentence and stores it in {@code nextSent}, or
         * stores {@code null} when the document is exhausted. Blocks that yield
         * no sentence are skipped.
         */
        private void primeNext() {
            do {
                if (this.plainItr != null && this.plainItr.hasNext()) {
                    // The current block still has sentences.
                    this.nextSent = this.plainItr.next();
                } else if (this.xmlItr.hasNext()) {
                    // Move on to the next block and tokenize it as plain text.
                    String block = this.xmlItr.next();
                    DocumentPreprocessor.this.inputReader = new BufferedReader(new StringReader(block));
                    this.plainItr = new PlainTextIterator();
                    this.nextSent = this.plainItr.hasNext() ? this.plainItr.next() : null;
                } else {
                    // Nothing left at all: release the original document.
                    IOUtils.closeIgnoringExceptions(this.originalDocReader);
                    this.nextSent = null;
                    return;
                }
            } while (this.nextSent == null);
        }

        @Override
        public boolean hasNext() {
            return this.nextSent != null;
        }

        /**
         * @return the next sentence
         * @throws NoSuchElementException if the document has no further sentences
         */
        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisSentence = this.nextSent;
            this.primeNext();
            return thisSentence;
        }

        /** Not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Sentence iterator for plain text. Tokens are read from the enclosing
     * instance's {@code inputReader} and grouped into sentences according to the
     * delimiter settings that are in effect when the iterator is created.
     *
     * <p>A sentence ends after a sentence-final token plus any directly following
     * "follower" tokens (closing brackets and quotes). The first token that
     * belongs to the next sentence has already been read at that point, so it is
     * carried over to the next call.
     */
    private class PlainTextIterator
    implements Iterator<List<HasWord>> {
        private final Tokenizer<? extends HasWord> tokenizer;
        private final Set<String> sentDelims;
        private final Set<String> delimFollowers;
        /** Splits a token into word and tag; {@code null} when no tag delimiter is configured. */
        private final Function<String, String[]> splitTag;
        private List<HasWord> nextSent;
        /** Token read past the end of the previous sentence; it starts the next one. */
        private final List<HasWord> nextSentCarryover = Generics.newArrayList();

        /** Captures the delimiter configuration and creates the tokenizer over the current reader. */
        public PlainTextIterator() {
            boolean eolIsSignificant = false;
            this.sentDelims = Generics.newHashSet();
            if (DocumentPreprocessor.this.sentenceDelimiter == null) {
                if (DocumentPreprocessor.this.sentenceFinalPuncWords != null) {
                    this.sentDelims.addAll(Arrays.asList(DocumentPreprocessor.this.sentenceFinalPuncWords));
                }
                this.delimFollowers = Generics.newHashSet(Arrays.asList(DocumentPreprocessor.this.sentenceFinalFollowers));
            } else {
                // An explicit delimiter replaces the punctuation set and has no followers.
                this.sentDelims.add(DocumentPreprocessor.this.sentenceDelimiter);
                this.delimFollowers = Generics.newHashSet();
                eolIsSignificant = wsPattern.matcher(DocumentPreprocessor.this.sentenceDelimiter).matches();
                if (eolIsSignificant) {
                    // Presumably the token the tokenizer emits for a line break when
                    // asked to "tokenizeNLs" -- confirm against the tokenizer.
                    this.sentDelims.add("*NL*");
                }
            }

            if (DocumentPreprocessor.this.tokenizerFactory == null) {
                eolIsSignificant = this.sentDelims.contains("\n");
                this.tokenizer = WhitespaceTokenizer.newWordWhitespaceTokenizer(DocumentPreprocessor.this.inputReader, eolIsSignificant);
            } else if (eolIsSignificant) {
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader, "tokenizeNLs");
            } else {
                this.tokenizer = DocumentPreprocessor.this.tokenizerFactory.getTokenizer(DocumentPreprocessor.this.inputReader);
            }

            final String delimiter = DocumentPreprocessor.this.tagDelimiter;
            if (delimiter == null) {
                this.splitTag = null;
            } else {
                // Matches the delimiter only where no further delimiter follows, i.e.
                // its last occurrence. The delimiter is inserted verbatim (not quoted).
                // Compiled once here instead of once per token.
                final Pattern lastDelimiter = Pattern.compile(String.format("%s(?!.*%s)", delimiter, delimiter));
                this.splitTag = in -> {
                    String[] splits = lastDelimiter.split(in.trim());
                    // Anything other than exactly word + tag is passed through unsplit.
                    return splits.length == 2 ? splits : new String[]{in};
                };
            }
        }

        /**
         * Reads tokens until the next sentence is complete and stores it in
         * {@code nextSent}; stores {@code null} when no sentence is left. When the
         * input is exhausted the reader is closed and the enclosing instance's
         * {@code inputReader} is set to {@code null}.
         */
        private void primeNext() {
            if (DocumentPreprocessor.this.inputReader == null) {
                // Input was exhausted and closed by an earlier call.
                return;
            }
            this.nextSent = Generics.newArrayList(this.nextSentCarryover);
            this.nextSentCarryover.clear();

            if (!this.tokenizer.hasNext()) {
                this.closeInput();
                // Only the carried-over token, if any, is left. It forms the final
                // sentence and is escaped like every other sentence.
                this.nextSent = this.nextSent.isEmpty() ? null : this.escape(this.nextSent);
                return;
            }

            boolean seenBoundary = false;
            do {
                HasWord token = this.tokenizer.next();
                if (this.splitTag != null) {
                    String[] toks = this.splitTag.apply(token.word());
                    token.setWord(toks[0]);
                    if (token instanceof Label) {
                        ((Label) token).setValue(toks[0]);
                    }
                    if (toks.length == 2 && token instanceof HasTag) {
                        ((HasTag) token).setTag(toks[1]);
                    }
                }

                if (this.sentDelims.contains(token.word())) {
                    seenBoundary = true;
                } else if (seenBoundary && !this.delimFollowers.contains(token.word())) {
                    // First token of the following sentence: keep it for the next call.
                    this.nextSentCarryover.add(token);
                    break;
                }

                // Whitespace-only and newline tokens never become part of a sentence.
                if (!wsPattern.matcher(token.word()).matches() && !token.word().equals("*NL*")) {
                    this.nextSent.add(token);
                }

                // Without follower tokens nothing can extend the sentence after a
                // delimiter, so stop without reading ahead. If the sentence is
                // still empty (the delimiter itself was dropped above) and empty
                // sentences are not wanted, carry on as if no boundary was seen.
                if (seenBoundary && this.delimFollowers.isEmpty()) {
                    if (!this.nextSent.isEmpty() || DocumentPreprocessor.this.keepEmptySentences) {
                        break;
                    }
                    seenBoundary = false;
                }
            } while (this.tokenizer.hasNext());

            if (this.nextSent.isEmpty() && this.nextSentCarryover.isEmpty() && !DocumentPreprocessor.this.keepEmptySentences) {
                this.closeInput();
                this.nextSent = null;
            } else {
                this.nextSent = this.escape(this.nextSent);
            }
        }

        /** Closes the current reader and marks the input as exhausted. */
        private void closeInput() {
            IOUtils.closeIgnoringExceptions(DocumentPreprocessor.this.inputReader);
            DocumentPreprocessor.this.inputReader = null;
        }

        /** Applies the configured escaper, if any, to a finished sentence. */
        private List<HasWord> escape(List<HasWord> sentence) {
            Function<List<HasWord>, List<HasWord>> esc = DocumentPreprocessor.this.escaper;
            return esc == null ? sentence : esc.apply(sentence);
        }

        @Override
        public boolean hasNext() {
            if (this.nextSent == null) {
                this.primeNext();
            }
            return this.nextSent != null;
        }

        /**
         * @return the next sentence
         * @throws NoSuchElementException if the input has no further sentences
         */
        @Override
        public List<HasWord> next() {
            if (this.nextSent == null) {
                this.primeNext();
            }
            if (this.nextSent == null) {
                throw new NoSuchElementException();
            }
            List<HasWord> thisIteration = this.nextSent;
            this.nextSent = null;
            return thisIteration;
        }

        /** Not supported. */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /** The kinds of input a {@code DocumentPreprocessor} can read. */
    public static enum DocType {
        /** Plain text, handled by the plain-text sentence iterator. */
        Plain,
        /** XML, from which text is extracted before sentence splitting. */
        XML;

    }
}

