# Natural Language Toolkit: Tokenizers
#
# Copyright (C) 2001-2013 NLTK Project
# Author: Edward Loper
#         Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed)

import re

# Penn Treebank replacement symbols for round brackets.
SYM_MAP = {
    '(': '-LRB-',
    ')': '-RRB-',
}


class PennTreebankTokenizer:
    """
    The PennTreebankTokenizer uses regular expressions to tokenize text as in
    the Penn Treebank. This implementation is a port of the tokenizer sed
    script written by Robert MacIntyre and available at:
    http://www.cis.upenn.edu/~treebank/tokenizer.sed

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't``
      and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from stat_parser.tokenizer import PennTreebankTokenizer
    >>> t = PennTreebankTokenizer()
    >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.'''
    >>> t.tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> t.tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    """

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"),
                     re.compile(r"(?i)\b(d)('ye)\b"),
                     re.compile(r"(?i)\b(gim)(me)\b"),
                     re.compile(r"(?i)\b(gon)(na)\b"),
                     re.compile(r"(?i)\b(got)(ta)\b"),
                     re.compile(r"(?i)\b(lem)(me)\b"),
                     re.compile(r"(?i)\b(mor)('n)\b"),
                     re.compile(r"(?i)\b(wan)(na) ")]
    CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"),
                     re.compile(r"(?i) ('t)(was)\b")]
    CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"),
                     re.compile(r"(?i)\b(wha)(t)(cha)\b")]

    def tokenize(self, text):
        # starting quotes
        text = re.sub(r'^\"', r'``', text)
        text = re.sub(r'(``)', r' \1 ', text)
        text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

        # punctuation
        text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
        text = re.sub(r'\.\.\.', r' ... ', text)
        text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
        # separate a sentence-final period (possibly followed by closing
        # brackets or quotes) from the preceding word
        text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text)
        text = re.sub(r'[?!]', r' \g<0> ', text)

        text = re.sub(r"([^'])' ", r"\1 ' ", text)

        # parens, brackets, etc.
        text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
        text = re.sub(r'--', r' -- ', text)

        # add extra space to make things easier
        text = " " + text + " "

        # ending quotes
        text = re.sub(r'"', " '' ", text)
        text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

        # split off possessives and clitics ('s, 'm, 'd, 'll, 're, 've, n't)
        text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
        text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r' \1 \2 ', text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r' \1 \2 ', text)

        # We are not using CONTRACTIONS4 since
        # they are also commented out in the SED scripts
        # for regexp in self.CONTRACTIONS4:
        #     text = regexp.sub(r' \1 \2 \3 ', text)

        words = []
        tokens = text.split()
        skip = False
        start_quotes = False
        for i, t in enumerate(tokens):
            if skip:
                skip = False

            # Tokenization Exceptions
            elif t == '&' and words and i + 1 < len(tokens) and len(tokens[i+1]) == 1:
                # rejoin constructs like "AT & T" -> "AT&T"; the extra bounds
                # checks guard against '&' at the start or end of the stream
                words[-1] += '&' + tokens[i+1]
                skip = True
            elif t == '#' and i + 1 < len(tokens):
                # rejoin constructs like "# 1" -> "#1"
                words.append('#' + tokens[i+1])
                skip = True
            elif t == "'s" and words and words[-1].isdigit():
                # reattach 's to a preceding number, e.g. "1980 's" -> "1980's"
                words[-1] += t

            # Special Penn symbols: keep track of original in tuple
            elif t in SYM_MAP:
                words.append((SYM_MAP[t], t))
            elif t == '"':
                if start_quotes:
                    start_quotes = False
                    words.append(("''", t))
                else:
                    start_quotes = True
                    words.append(('``', t))
            else:
                words.append(t)

        return words
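

# A minimal usage sketch: running this module directly exercises the two
# behaviors documented above. The expected outputs in the comments follow
# from the doctest and from SYM_MAP; note that tokenize() returns plain
# strings for ordinary tokens but (PTB-symbol, original) tuples for
# bracket characters.
if __name__ == '__main__':
    tokenizer = PennTreebankTokenizer()

    # Contractions are split and the sentence-final period is separated.
    print(tokenizer.tokenize("They'll save and invest more."))
    # -> ['They', "'ll", 'save', 'and', 'invest', 'more', '.']

    # Round brackets come back as (Penn-symbol, original-character) tuples.
    print(tokenizer.tokenize("a (small) test"))
    # -> ['a', ('-LRB-', '('), 'small', ('-RRB-', ')'), 'test']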