package de.lmu.ifi.dbs.elki.utilities.io;

/*
 This file is part of ELKI:
 Environment for Developing KDD-Applications Supported by Index-Structures

 Copyright (C) 2015
 Ludwig-Maximilians-Universität München
 Lehr- und Forschungseinheit für Datenbanksysteme
 ELKI Development Team

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU Affero General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Affero General Public License for more details.

 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.utilities.FormatUtil;
import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.Iter;

/**
 * String tokenizer.
 * 
 * @author Erich Schubert
 * @since 0.6.0
 */
public class Tokenizer implements Iter {
  /**
   * Class logger.
   */
  private static final Logging LOG = Logging.getLogger(Tokenizer.class);

  /**
   * Quote characters
   */
  public static final String QUOTE_CHAR = "\"'";

  /**
   * Stores the quotation character
   */
  private char[] quoteChars = QUOTE_CHAR.toCharArray();

  /**
   * Constructor.
   * 
   * @param colSep Column separator pattern.
   * @param quoteChars Quotation character.
   */
  public Tokenizer(Pattern colSep, String quoteChars) {
    super();
    assert (colSep != null) : "Column separator may not be null.";
    this.matcher = colSep.matcher("");
    this.quoteChars = quoteChars != null ? quoteChars.toCharArray() : new char[0];
  }

  /**
   * Regular expression match helper.
   */
  private Matcher matcher;

  /**
   * Data currently processed.
   */
  private CharSequence input;

  /**
   * Substring to process.
   */
  private int send;

  /**
   * Current positions of result and iterator.
   */
  private int start, end, index;

  /**
   * Whether the current token is a quoted string.
   */
  private boolean quoted;

  /**
   * Initialize parser with a new string.
   * 
   * @param input New string to parse.
   * @param begin Begin
   * @param end End
   */
  public void initialize(CharSequence input, int begin, int end) {
    this.input = input;
    this.send = end;
    this.matcher.reset(input).region(begin, end);
    this.index = begin;
    advance();
  }

  @Override
  public boolean valid() {
    return start < send;
  }

  @Override
  public Tokenizer advance() {
    char inquote = isQuote(index);
    while(matcher.find()) {
      // Quoted code path vs. regular code path
      if(inquote != 0) {
        // Matching closing quote found?
        if(matcher.start() > index + 1 && input.charAt(matcher.start() - 1) == inquote) {
          this.start = index + 1;
          this.end = matcher.start() - 1;
          this.index = matcher.end();
          this.quoted = true;
          return this;
        }
        continue;
      }
      else {
        this.start = index;
        this.end = matcher.start();
        this.index = matcher.end();
        this.quoted = false;
        return this;
      }
    }
    // Add tail after last separator.
    this.start = index;
    this.end = send;
    this.index = end + 1;
    this.quoted = false;
    if(inquote != 0) {
      final int last = send - 1;
      if(input.charAt(last) == inquote) {
        ++this.start;
        --this.end;
        this.quoted = true;
      }
      else {
        LOG.warning("Invalid quoted line in input: no closing quote found in: " + input);
      }
    }
    return this;
  }

  /**
   * Get the current part as substring
   * 
   * @return Current value as substring.
   */
  public String getSubstring() {
    // TODO: detect Java <6 and make sure we only return the substring?
    // With java 7, String.substring will arraycopy the characters.
    return input.subSequence(start, end).toString();
  }

  /**
   * Get the current part as substring
   * 
   * @return Current value as substring.
   */
  public String getStrippedSubstring() {
    // TODO: detect Java <6 and make sure we only return the substring?
    // With java 7, String.substring will arraycopy the characters.
    int sstart = start, send = end;
    while(sstart < send) {
      char c = input.charAt(sstart);
      if(c != ' ' || c != '\n' || c != '\r' || c != '\t') {
        break;
      }
      ++sstart;
    }
    while(--send >= sstart) {
      char c = input.charAt(send);
      if(c != ' ' || c != '\n' || c != '\r' || c != '\t') {
        break;
      }
    }
    ++send;
    return (sstart < send) ? input.subSequence(sstart, send).toString() : "";
  }

  /**
   * Get current value as double.
   * 
   * @return double value
   * @throws NumberFormatException when current value cannot be parsed as double
   *         value.
   */
  public double getDouble() throws NumberFormatException {
    return FormatUtil.parseDouble(input, start, end);
  }

  /**
   * Get current value as long.
   * 
   * @return double value
   * @throws NumberFormatException when current value cannot be parsed as long
   *         value.
   */
  public long getLongBase10() throws NumberFormatException {
    return FormatUtil.parseLongBase10(input, start, end);
  }

  /**
   * Test for empty tokens; usually at end of line.
   * 
   * @return Empty
   */
  public boolean isEmpty() {
    return end <= start;
  }

  /**
   * Detect quote characters.
   * 
   * TODO: support more than one quote character, make sure opening and closing
   * quotes match then.
   * 
   * @param index Position
   * @return {@code 1} when a quote character, {@code 0} otherwise.
   */
  private char isQuote(int index) {
    if(index >= input.length()) {
      return 0;
    }
    char c = input.charAt(index);
    for(int i = 0; i < quoteChars.length; i++) {
      if(c == quoteChars[i]) {
        return c;
      }
    }
    return 0;
  }

  /**
   * Test if the current string was quoted.
   * 
   * @return {@code true} when quoted.
   */
  public boolean isQuoted() {
    return quoted;
  }

  /**
   * Get start of token.
   * 
   * @return Start
   */
  public int getStart() {
    return start;
  }

  /**
   * Get end of token.
   * 
   * @return End
   */
  public int getEnd() {
    return end;
  }

  /**
   * Perform cleanup.
   */
  public void cleanup() {
    input = null;
    matcher.reset("");
  }
}