/*
 * Decompiled with CFR 0.152.
 */
package org.ujmp.core.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.ujmp.core.DenseMatrix;
import org.ujmp.core.Matrix;
import org.ujmp.core.collections.list.FastArrayList;
import org.ujmp.core.doublematrix.DenseDoubleMatrix2D;
import org.ujmp.core.doublematrix.impl.DefaultSparseDoubleMatrix;
import org.ujmp.core.text.DefaultTextBlock;
import org.ujmp.core.text.DefaultTextSentence;
import org.ujmp.core.text.DefaultTextToken;
import org.ujmp.core.text.TextSentence;
import org.ujmp.core.text.TextToken;
import org.ujmp.core.util.VerifyUtil;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public abstract class TextUtil {
    // Token-shape patterns. Every pattern is anchored with ^ and $, so it
    // describes a complete string rather than a substring.
    public static final String ALPHA_NUMERIC_REGEX = "^[A-Za-z0-9]+$";
    public static final String HAS_DASH_REGEX = "^.+-.+$";
    public static final String INIT_DASH_REGEX = "^-.+$";
    // Dash at the END of the string (this used to be a copy of INIT_DASH_REGEX).
    public static final String END_DASH_REGEX = "^.+-$";
    public static final String PUNCTUATION_REGEX = "^[,.:;!?]$";
    public static final String ONE_QUESTION_MARK_REGEX = "^[?]$";
    // A character class such as [??] matches ONE character only, so the
    // two/three-mark patterns repeat the class once per expected mark.
    public static final String TWO_QUESTION_MARKS_REGEX = "^[?][?]$";
    public static final String THREE_QUESTION_MARKS_REGEX = "^[?][?][?]$";
    public static final String MULTIPLE_QUESTION_MARKS_REGEX = "^[?][?]+$";
    public static final String ONE_EXCLAMATION_MARK_REGEX = "^[!]$";
    public static final String TWO_EXCLAMATION_MARKS_REGEX = "^[!][!]$";
    public static final String THREE_EXCLAMATION_MARKS_REGEX = "^[!][!][!]$";
    public static final String MULTIPLE_EXCLAMATION_MARKS_REGEX = "^[!][!]+$";
    public static final String QUESTION_EXCLAMATION_MARK_REGEX = "^[?][!]$";
    public static final String EXCLAMATION_QUESTION_MARK_REGEX = "^[!][?]$";
    public static final String INIT_CAPS_REGEX = "^[A-Z].+$";
    public static final String INIT_CAPS_ALPHA_REGEX = "^[A-Z][a-z]+$";
    public static final String ONE_CAP_REGEX = "^[A-Z]$";
    public static final String TWO_CAPS_REGEX = "^[A-Z][A-Z]$";
    public static final String THREE_CAPS_REGEX = "^[A-Z][A-Z][A-Z]$";
    public static final String FOUR_CAPS_REGEX = "^[A-Z][A-Z][A-Z][A-Z]$";
    public static final String ALL_CAPS_REGEX = "^[A-Z]+$";
    public static final String CAPS_MIX_REGEX = "^[A-Za-z]+$";
    public static final String ONE_DIGIT_REGEX = "^[0-9]$";
    public static final String TWO_DIGITS_REGEX = "^[0-9][0-9]$";
    public static final String THREE_DIGITS_REGEX = "^[0-9][0-9][0-9]$";
    public static final String FOUR_DIGITS_REGEX = "^[0-9][0-9][0-9][0-9]$";
    // NOTE(review): requires at least one character on BOTH sides of the digit,
    // so "a1" does not match - confirm this is intended.
    public static final String HAS_DIGIT_REGEX = "^.+[0-9].+$";
    public static final String POSITIVE_INTEGER_REGEX = "^[0-9]+$";
    public static final String NEGATIVE_INTEGER_REGEX = "^-[0-9]+$";
    public static final String FLOATING_POINT_NUMBER_REGEX = "^[-+]?[0-9]*\\.?[0-9]+$";
    public static final String EXP_NUMBER_REGEX = "^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$";
    public static final String ROMAN_NUMBER_SMALL_REGEX = "^[ivxdlcm]+$";
    public static final String ROMAN_NUMBER_CAPITAL_REGEX = "^[IVXDLCM]+$";
    public static final String SINGLE_INITIAL_REGEX = "^[a-zA-Z]\\.$";
    // Parentheses are escaped so they are matched literally; unescaped they
    // formed a capturing group and the pattern accepted any non-empty string.
    public static final String IN_PARENTHESES_REGEX = "^\\(.+\\)$";
    public static final String OBD_REGEX = "^[PBCU][0-9A-F][0-9A-F][0-9A-F][0-9A-F]$";
    public static final String YEAR_REGEX = "^[12][0-9][0-9][0-9]$";
    public static final String HEX_REGEX = "^[0-9A-Fa-f][0-9A-Fa-f]+$";
    public static final String EMAIL_REGEX = "^([a-z0-9_\\.-]+)@([\\da-z\\.-]+)\\.([a-z\\.]{2,6})$";
    public static final String IP_REGEX = "^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$";
    // NOTE(review): HTML_REGEX and URL_REGEX contain nested quantifiers
    // ("(...+)*" / "(...*)*"); these may backtrack heavily on long
    // non-matching input - verify before applying them to untrusted text.
    public static final String HTML_REGEX = "^<([a-z]+)([^<]+)*(?:>(.*)<\\/\\1>|\\s+\\/>)$";
    public static final String URL_REGEX = "^(https?:\\/\\/)?([\\da-z\\.-]+)\\.([a-z\\.]{2,6})([\\/\\w \\.-]*)*\\/?$";

    /**
     * Counts the occurrences of each character of the lower-cased input whose
     * code is below 256.
     *
     * @param s text to analyse; it is lower-cased before counting
     * @param validCharacters characters to count; when none are given, every
     *        character with a code below 256 is counted
     * @return the result of {@code Matrix.Factory.linkToArray} for the
     *         256-element count array, indexed by character code
     */
    public static final DenseDoubleMatrix2D getCharacterFrequencies(String s, char ... validCharacters) {
        String lowered = s.toLowerCase();
        boolean[] accepted = TextUtil.createCharacterVector(validCharacters);
        double[] counts = new double[256];
        for (int pos = 0; pos < lowered.length(); pos++) {
            char ch = lowered.charAt(pos);
            // Characters outside the 256-entry table are ignored.
            if (ch < '\u0100' && accepted[ch]) {
                counts[ch] += 1.0;
            }
        }
        return Matrix.Factory.linkToArray(counts);
    }

    /**
     * Splits a single line of text into sentences at the characters
     * {@code . : ; ! ?}.
     *
     * <p>Every punctuation mark is appended to the sentence it terminates, so
     * runs such as {@code "?!"} stay attached to the preceding text. Text that
     * follows a sentence ending in an abbreviation (see
     * {@link #endsWithAbbreviation(String)}) is appended to that sentence
     * instead of starting a new one. Whitespace-only fragments between
     * punctuation marks are skipped; other fragments are kept untrimmed.
     *
     * <p>Earlier versions used a three-token sliding window. That returned an
     * empty list for lines with fewer than three tokens (for example
     * {@code "Hello."}), dropped trailing text without closing punctuation,
     * and failed with an index error on input such as {@code ". ."}. Each token
     * is now handled as soon as it is read, which removes those cases.
     *
     * @param line text without line breaks, must not be null
     * @return the sentences in order of appearance; empty if the line contains
     *         no non-blank text and no punctuation
     */
    public static final List<String> splitLineIntoSentences(String line) {
        VerifyUtil.verifyNotNull(line, "text cannot be null");
        VerifyUtil.verifyTrue(line.split("\n").length == 1, "text must be in one line");
        final String delimiters = ".:;!?";
        StringTokenizer tokenizer = new StringTokenizer(line, delimiters, true);
        List<String> sentences = new FastArrayList<String>();
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken();
            // Delimiters are returned by the tokenizer as one-character tokens.
            boolean isDelimiter = token.length() == 1 && delimiters.indexOf(token.charAt(0)) >= 0;
            if (!isDelimiter && token.trim().isEmpty()) {
                // Whitespace between two punctuation marks carries no content.
                continue;
            }
            int last = sentences.size() - 1;
            boolean extendPrevious = last >= 0
                    && (isDelimiter || TextUtil.endsWithAbbreviation(sentences.get(last)));
            if (extendPrevious) {
                sentences.set(last, sentences.get(last) + token);
            } else {
                sentences.add(token);
            }
        }
        return sentences;
    }

    /**
     * Converts raw text into a {@code DefaultTextBlock} holding one
     * {@code DefaultTextSentence} per sentence and one {@code DefaultTextToken}
     * per token.
     *
     * <p>Line breaks ({@code "\r\n"} and {@code "\n"}) are replaced by spaces
     * before the text is split. Each sentence and each token is tagged with a
     * zero-based {@code "Id"} reflecting its position.
     *
     * @param text text to split
     * @return the populated text block
     */
    public static final DefaultTextBlock splitTextIntoObjects(String text) {
        DefaultTextBlock block = new DefaultTextBlock(new TextSentence[0]);
        String singleLine = text.replaceAll("\r\n", " ").replaceAll("\n", " ");
        int sentenceId = 0;
        for (String sentenceText : TextUtil.splitLineIntoSentences(singleLine)) {
            DefaultTextSentence sentence = new DefaultTextSentence(new TextToken[0]);
            sentence.setMetaData("Id", sentenceId);
            sentenceId++;
            int tokenId = 0;
            // An n-gram size of 0 means no padding tokens are added.
            for (String tokenText : TextUtil.splitSentenceIntoTokens(sentenceText, 0)) {
                DefaultTextToken token = new DefaultTextToken(tokenText);
                token.put("Id", tokenId);
                tokenId++;
                sentence.add(token);
            }
            block.add(sentence);
        }
        return block;
    }

    /**
     * Builds the word trigrams of a text, sentence by sentence.
     *
     * <p>The text is split into lines, each line into sentences, and each
     * sentence into tokens padded for an n-gram size of 3; the trigrams of all
     * sentences are collected in one list.
     *
     * @param text text to process
     * @return all trigrams in order of appearance, each a list of three words
     */
    public static final List<List<String>> createWordTrigrams(String text) {
        FastArrayList<List<String>> collected = new FastArrayList<List<String>>();
        for (String line : TextUtil.splitTextIntoLines(text)) {
            for (String sentence : TextUtil.splitLineIntoSentences(line)) {
                List<String> paddedWords = TextUtil.splitSentenceIntoTokens(sentence, 3);
                collected.addAll(TextUtil.createWordTrigrams(paddedWords));
            }
        }
        return collected;
    }

    /**
     * Splits text at {@code "\n"} characters.
     *
     * @param text text to split, must not be null
     * @return a fixed-size list view of the resulting lines
     */
    public static final List<String> splitTextIntoLines(String text) {
        VerifyUtil.verifyNotNull(text, "text cannot be null");
        String[] lines = text.split("\n");
        return Arrays.asList(lines);
    }

    /**
     * Builds the word bigrams of a text, sentence by sentence.
     *
     * <p>The text is split into lines, each line into sentences, and each
     * sentence into tokens padded for an n-gram size of 2; the bigrams of all
     * sentences are collected in one list.
     *
     * @param text text to process
     * @return all bigrams in order of appearance, each a list of two words
     */
    public static final List<List<String>> createWordBigrams(String text) {
        FastArrayList<List<String>> collected = new FastArrayList<List<String>>();
        for (String line : TextUtil.splitTextIntoLines(text)) {
            for (String sentence : TextUtil.splitLineIntoSentences(line)) {
                List<String> paddedWords = TextUtil.splitSentenceIntoTokens(sentence, 2);
                collected.addAll(TextUtil.createWordBigrams(paddedWords));
            }
        }
        return collected;
    }

    /**
     * Collects the tokens of every sentence of a text in one flat list.
     *
     * @param text text to process
     * @param ngramSize passed to
     *        {@link #splitSentenceIntoTokens(String, int)}, which pads every
     *        sentence with {@code ngramSize - 1} empty tokens on both sides
     * @return the tokens of all sentences in order of appearance
     */
    public static final List<String> createWordUnigrams(String text, int ngramSize) {
        FastArrayList<String> collected = new FastArrayList<String>();
        for (String line : TextUtil.splitTextIntoLines(text)) {
            for (String sentence : TextUtil.splitLineIntoSentences(line)) {
                List<String> words = TextUtil.splitSentenceIntoTokens(sentence, ngramSize);
                collected.addAll(words);
            }
        }
        return collected;
    }

    /**
     * Counts how often each word bigram occurs in a text.
     *
     * @param text text to process
     * @return a map from bigram (list of two words) to its number of occurrences
     */
    public static Map<List<String>, Integer> getWordBigramCounts(String text) {
        HashMap<List<String>, Integer> counts = new HashMap<List<String>, Integer>();
        for (List<String> bigram : TextUtil.createWordBigrams(text)) {
            Integer seen = counts.get(bigram);
            if (seen == null) {
                counts.put(bigram, 1);
            } else {
                counts.put(bigram, seen + 1);
            }
        }
        return counts;
    }

    /**
     * Counts how often each token occurs in a text.
     *
     * @param text text to process
     * @param ngramSize forwarded to {@link #createWordUnigrams(String, int)};
     *        it controls the padding applied to every sentence
     * @return a map from token to its number of occurrences
     */
    public static Map<String, Integer> getWordUnigramCounts(String text, int ngramSize) {
        HashMap<String, Integer> counts = new HashMap<String, Integer>();
        List<String> words = TextUtil.createWordUnigrams(text, ngramSize);
        for (int pos = 0; pos < words.size(); pos++) {
            String word = words.get(pos);
            Integer previous = counts.get(word);
            int updated = (previous == null) ? 1 : previous + 1;
            counts.put(word, updated);
        }
        return counts;
    }

    /**
     * Splits one sentence into tokens at whitespace, punctuation, quotes,
     * dashes, slashes and brackets.
     *
     * <p>Delimiter characters are returned as tokens of their own, except for
     * whitespace, which is dropped. All tokens are trimmed and interned. The
     * result is padded with {@code ngramSize - 1} empty strings at the start
     * and at the end; no padding is added for {@code ngramSize <= 1}.
     *
     * @param sentence text without line breaks, must not be null
     * @param ngramSize n-gram size the token list is padded for
     * @return the padded token list
     */
    public static final List<String> splitSentenceIntoTokens(String sentence, int ngramSize) {
        VerifyUtil.verifyNotNull(sentence, "text cannot be null");
        VerifyUtil.verifyTrue(sentence.split("\n").length == 1, "text must be in one line");
        // Strings are immutable, so the result of replace() has to be kept.
        // The old code discarded it and filtered non-breaking spaces later.
        String normalized = sentence.replace('\u00a0', ' ');
        StringTokenizer tokenizer = new StringTokenizer(normalized, " \u00a0.;,\u3001\u060c:&\\\u2044/\u201d\u201c\u2018\"\u2015\u2014\u2013\u2012\u2010-!?{}()[]", true);
        FastArrayList<String> tokens = new FastArrayList<String>();
        // String literals are already interned, so "" needs no intern() call.
        for (int i = 1; i < ngramSize; ++i) {
            tokens.add("");
        }
        while (tokenizer.hasMoreTokens()) {
            String token = tokenizer.nextToken().trim();
            if (token.isEmpty()) {
                // Whitespace delimiter (including former non-breaking spaces).
                continue;
            }
            tokens.add(token.intern());
        }
        for (int i = 1; i < ngramSize; ++i) {
            tokens.add("");
        }
        return tokens;
    }

    /**
     * Pairs every word with its successor.
     *
     * @param words words in their original order
     * @return one two-element list per adjacent pair; empty when fewer than
     *         two words are given
     */
    public static final List<List<String>> createWordBigrams(List<String> words) {
        FastArrayList<List<String>> pairs = new FastArrayList<List<String>>();
        // "right" is the index of the second word of the pair.
        for (int right = 1; right < words.size(); right++) {
            String first = words.get(right - 1);
            String second = words.get(right);
            List<String> pair = new FastArrayList<String>(2);
            pair.add(first);
            pair.add(second);
            pairs.add(pair);
        }
        return pairs;
    }

    /**
     * Groups every word with its two successors.
     *
     * @param words words in their original order
     * @return one three-element list per run of three adjacent words; empty
     *         when fewer than three words are given
     */
    public static final List<List<String>> createWordTrigrams(List<String> words) {
        FastArrayList<List<String>> triples = new FastArrayList<List<String>>();
        // "end" is the index of the third word of the triple.
        for (int end = 2; end < words.size(); end++) {
            String first = words.get(end - 2);
            String second = words.get(end - 1);
            String third = words.get(end);
            List<String> triple = new FastArrayList<String>(3);
            triple.add(first);
            triple.add(second);
            triple.add(third);
            triples.add(triple);
        }
        return triples;
    }

    /**
     * Multi-letter abbreviations recognised by
     * {@link #endsWithAbbreviation(String)}. Every entry keeps its leading
     * space and trailing dot and is written in lower case.
     */
    private static final String[] WORD_ABBREVIATIONS = {
        " ca.", " vs.", " rep.", " etc.", " usw.", " resp.", " incl.", " inkl.",
        " insges.", " zyl.", " cyl.", " dr.", " prof.", " gr.", " ppm.", " ggf."
    };

    /**
     * Tests whether a string ends with something that looks like an
     * abbreviation or enumeration marker rather than the end of a sentence.
     *
     * <p>The comparison is case-insensitive. The following endings are
     * recognised:
     * <ul>
     * <li>space + single digit + {@code '.'} or {@code ':'}
     *     (e.g. {@code " 3."}, {@code " 3:"})</li>
     * <li>a number from 10 to 39 followed by {@code '.'}, with no requirement
     *     on the preceding character (e.g. {@code "25."})</li>
     * <li>space or dot + single letter (a-z, ä, ö, ü, ß) + {@code '.'}
     *     (e.g. {@code " a."}, {@code ".b."})</li>
     * <li>one of the fixed abbreviations such as {@code " etc."} or
     *     {@code " dr."}, each preceded by a space</li>
     * </ul>
     *
     * <p>Lower-casing uses {@code Locale.ROOT} so that the result does not
     * depend on the default locale (a Turkish default locale would otherwise
     * map {@code 'I'} to a dotless i and miss {@code " I."}).
     *
     * @param string text to inspect, must not be null
     * @return {@code true} if the text ends with one of the patterns above
     */
    public static boolean endsWithAbbreviation(String string) {
        String lower = string.toLowerCase(java.util.Locale.ROOT);
        int length = lower.length();
        if (length >= 3) {
            char first = lower.charAt(length - 3);
            char middle = lower.charAt(length - 2);
            char last = lower.charAt(length - 1);
            if (last == '.') {
                // " 0." .. " 9." and " a." .. " z." (plus umlauts and sharp s)
                if (first == ' ' && (isAsciiDigit(middle) || isAbbreviationLetter(middle))) {
                    return true;
                }
                // ".a." .. ".z." (plus umlauts and sharp s), e.g. "z.B."
                if (first == '.' && isAbbreviationLetter(middle)) {
                    return true;
                }
                // "10." .. "39."
                if (first >= '1' && first <= '3' && isAsciiDigit(middle)) {
                    return true;
                }
            } else if (last == ':' && first == ' ' && isAsciiDigit(middle)) {
                // " 0:" .. " 9:"
                return true;
            }
        }
        for (String abbreviation : WORD_ABBREVIATIONS) {
            if (lower.endsWith(abbreviation)) {
                return true;
            }
        }
        return false;
    }

    /** Returns true for the characters '0' to '9' only (no other Unicode digits). */
    private static boolean isAsciiDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /** Returns true for the lower-case letters accepted as one-letter abbreviations. */
    private static boolean isAbbreviationLetter(char c) {
        return (c >= 'a' && c <= 'z') || c == '\u00e4' || c == '\u00f6' || c == '\u00fc' || c == '\u00df';
    }

    /**
     * Builds a 256-entry lookup table telling which character codes are
     * accepted.
     *
     * @param chars accepted characters; characters with a code of 256 or above
     *        are ignored. When no characters are given, every entry is set to
     *        {@code true}.
     * @return the lookup table, indexed by character code
     */
    private static final boolean[] createCharacterVector(char ... chars) {
        boolean[] accepted = new boolean[256];
        // With an empty argument list everything is accepted and the loop
        // below has nothing to do.
        boolean acceptAll = chars.length == 0;
        Arrays.fill(accepted, acceptAll);
        for (int pos = 0; pos < chars.length; pos++) {
            char ch = chars[pos];
            if (ch < '\u0100') {
                accepted[ch] = true;
            }
        }
        return accepted;
    }

    /**
     * Counts character bigrams of the lower-cased input, which is padded with
     * one space on each side.
     *
     * <p>A bigram is counted only if both characters have a code below 256 and
     * are accepted by {@code validCharacters}. The count of the bigram
     * {@code (c1, c2)} is stored in row {@code c1 * 256 + c2}, column 0.
     *
     * @param s text to analyse
     * @param validCharacters characters to consider; when none are given,
     *        every character with a code below 256 is considered
     * @return a 65536 x 1 sparse matrix holding the bigram counts
     */
    public static final Matrix getCharacterBigramFrequencies(String s, char ... validCharacters) {
        String padded = " " + s.toLowerCase() + " ";
        boolean[] accepted = TextUtil.createCharacterVector(validCharacters);
        DefaultSparseDoubleMatrix counts = new DefaultSparseDoubleMatrix(65536L, 1L);
        // pos is the index of the second character of the bigram.
        for (int pos = padded.length() - 1; pos > 0; pos--) {
            char left = padded.charAt(pos - 1);
            char right = padded.charAt(pos);
            boolean inTable = left < '\u0100' && right < '\u0100';
            if (inTable && accepted[left] && accepted[right]) {
                int index = left * 256 + right;
                counts.setAsDouble(counts.getAsDouble(index, 0L) + 1.0, index, 0L);
            }
        }
        return counts;
    }

    /**
     * Counts character trigrams of the lower-cased input, which is padded with
     * one space on each side.
     *
     * <p>A trigram is counted only if all three characters have a code below
     * 256 and are accepted by {@code validCharacters}. The count of the trigram
     * {@code (c1, c2, c3)} is stored in row
     * {@code c1 * 65536 + c2 * 256 + c3}, column 0.
     *
     * <p>The loop stops once fewer than two characters precede the current
     * position. The previous version continued down to index 1 and read
     * {@code charAt(-1)}, so every call ended in a
     * {@code StringIndexOutOfBoundsException}.
     *
     * @param s text to analyse
     * @param validCharacters characters to consider; when none are given,
     *        every character with a code below 256 is considered
     * @return a 16777216 x 1 sparse matrix holding the trigram counts
     */
    public static final Matrix getCharacterTrigramFrequencies(String s, char ... validCharacters) {
        String copy = " " + s.toLowerCase() + " ";
        boolean[] isValidCharacter = TextUtil.createCharacterVector(validCharacters);
        DefaultSparseDoubleMatrix m = new DefaultSparseDoubleMatrix(0x1000000L, 1L);
        // i is the index of the third character; it must leave room for two
        // predecessors, hence the lower bound of 2.
        for (int i = copy.length() - 1; i >= 2; --i) {
            char c1 = copy.charAt(i - 2);
            char c2 = copy.charAt(i - 1);
            char c3 = copy.charAt(i);
            if (c1 >= '\u0100' || c2 >= '\u0100' || c3 >= '\u0100' || !isValidCharacter[c1] || !isValidCharacter[c2] || !isValidCharacter[c3]) {
                continue;
            }
            int index = c1 * 65536 + c2 * 256 + c3;
            m.setAsDouble(m.getAsDouble(index, 0L) + 1.0, index, 0L);
        }
        return m;
    }

    /**
     * Builds a bag-of-words column vector for a text.
     *
     * <p>The text is split at whitespace and at the characters
     * {@code , . : ; ? ! [ ] '}. For every word found in the dictionary, the
     * entry at the word's dictionary position is incremented. Words that are
     * not in the dictionary are skipped; previously their {@code indexOf}
     * result of -1 was used as a row index.
     *
     * @param string text to convert
     * @param dictionary known words; the position of a word is its row index
     * @return a {@code dictionary.size()} x 1 matrix of word counts
     */
    public static final Matrix createBagOfWordsVector(String string, List<String> dictionary) {
        DenseMatrix m = Matrix.Factory.zeros((long)dictionary.size(), 1L);
        StringTokenizer st = new StringTokenizer(string, " \t\n\r\f,.:;?![]'");
        while (st.hasMoreTokens()) {
            long index = dictionary.indexOf(st.nextToken());
            if (index < 0L) {
                // Out-of-vocabulary word: there is no row to count it in.
                continue;
            }
            m.setAsDouble(m.getAsDouble(index, 0L) + 1.0, index, 0L);
        }
        return m;
    }

    /**
     * Tokenizes a sentence and wraps every token in a
     * {@code DefaultTextToken}.
     *
     * @param sentence text without line breaks
     * @return the tokens in order of appearance; no padding tokens are added
     */
    public static Collection<TextToken> convertSentenceToTextTokens(String sentence) {
        // An n-gram size of 1 yields the plain tokens without padding.
        List<String> words = TextUtil.splitSentenceIntoTokens(sentence, 1);
        ArrayList<TextToken> wrapped = new ArrayList<TextToken>();
        for (int pos = 0; pos < words.size(); pos++) {
            wrapped.add(new DefaultTextToken(words.get(pos)));
        }
        return wrapped;
    }

    /**
     * Splits a text into sentences and wraps every sentence in a
     * {@code DefaultTextSentence}.
     *
     * @param text text to split; line breaks are treated as spaces
     * @return the sentences in order of appearance
     */
    public static Collection<TextSentence> convertToTextBlockToSentences(String text) {
        List<String> sentenceTexts = TextUtil.splitTextIntoSentences(text);
        ArrayList<TextSentence> wrapped = new ArrayList<TextSentence>();
        for (int pos = 0; pos < sentenceTexts.size(); pos++) {
            wrapped.add(new DefaultTextSentence(sentenceTexts.get(pos)));
        }
        return wrapped;
    }

    /**
     * Splits a possibly multi-line text into sentences.
     *
     * <p>Every {@code '\n'} is replaced by a space before the text is handed
     * to {@link #splitLineIntoSentences(String)}.
     *
     * @param text text to split
     * @return the sentences in order of appearance
     */
    public static List<String> splitTextIntoSentences(String text) {
        String singleLine = text.replace('\n', ' ');
        return TextUtil.splitLineIntoSentences(singleLine);
    }

    /**
     * Converts a text into a hashed word-count vector with 131072 rows.
     *
     * @param string text to convert
     * @return the result of {@link #stringToVector(String, int)} for a size
     *         of 131072
     */
    public static Matrix stringToVector(String string) {
        final int defaultSize = 131072;
        return TextUtil.stringToVector(string, defaultSize);
    }

    /**
     * Converts a text into a hashed word-count vector.
     *
     * <p>The text is split at whitespace and at the characters
     * {@code , . : ; ? ! [ ] '}. Every word is lower-cased and mapped to the
     * row {@code |hashCode % size|}, whose entry is incremented; different
     * words may therefore share a row.
     *
     * <p>The remainder is taken before the absolute value. For every hash
     * except {@code Integer.MIN_VALUE} this selects the same row as the
     * previous {@code Math.abs(hash) % size}. For {@code Integer.MIN_VALUE}
     * the old form produced a negative row index, because
     * {@code Math.abs(Integer.MIN_VALUE)} is itself negative.
     *
     * @param string text to convert
     * @param size number of rows of the resulting vector
     * @return a {@code size} x 1 matrix of hashed word counts
     */
    public static Matrix stringToVector(String string, int size) {
        DenseMatrix m = Matrix.Factory.zeros((long)size, 1L);
        StringTokenizer st = new StringTokenizer(string, " \t\n\r\f,.:;?![]'");
        while (st.hasMoreTokens()) {
            int hash = st.nextToken().toLowerCase().hashCode();
            // |hash % size| is always within [0, size), even for MIN_VALUE.
            long index = Math.abs(hash % size);
            m.setAsDouble(m.getAsDouble(index, 0L) + 1.0, index, 0L);
        }
        return m;
    }
}

