view pyrect/regexp/lexer.py @ 53:1f8c474ca8b3

bug fix: modify escape character parsing rule.
author Ryoma SHINYA <shinya@firefly.cr.ie.u-ryukyu.ac.jp>
date Wed, 25 Aug 2010 20:50:52 +0900
parents abb0691e792a
children 5db856953793
line wrap: on
line source

#!/usr/bin/env python
#-*- encoding: utf-8 -*-

from ply import lex

# Names of every token type this lexer can emit; PLY reads this module-level
# `tokens` sequence when the lexer is built.  Each name has a matching t_<NAME>
# rule function below.
tokens = (
    # operators
    'UNION', 'STAR',
    # grouping and character-class delimiters
    'LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET',
    # anchors
    'CARET', 'DOLLAR',
    # literal characters
    'NORMALCHAR', 'MBCHAR',
    # range separator, wildcard, backslash escape
    'DASH', 'ANYCHAR', 'ESCAPECHAR',
    # remaining repetition operators
    'PLUS', 'QMARK',
)

# Backslash escape: a backslash followed by one character in the range
# space..tilde.  The token value is reduced to the escaped character alone
# (the last character of the match), dropping the backslash.
# PLY tries function rules in definition order, so being defined first lets
# this rule win over the single-character rules below.
def t_ESCAPECHAR(t):
    ur'\\[ -~]'
    t.value = t.value[-1]
    return t

# Matches a literal '|'.  The docstring is the regex PLY uses for this rule.
def t_UNION(t):
    ur'\|'
    return t

# Matches a literal '*'.  The docstring is the regex PLY uses for this rule.
def t_STAR(t):
    ur'\*'
    return t

# Matches a literal '?'.  The docstring is the regex PLY uses for this rule.
def t_QMARK(t):
    ur'\?'
    return t

# Matches a literal '+'.  The docstring is the regex PLY uses for this rule.
def t_PLUS(t):
    ur'\+'
    return t

# Matches a literal '('.  The docstring is the regex PLY uses for this rule.
def t_LPAREN(t):
    ur'\('
    return t

# Matches a literal ')'.  The docstring is the regex PLY uses for this rule.
def t_RPAREN(t):
    ur'\)'
    return t

# Matches a literal '-'.  Defined ahead of t_NORMALCHAR, whose range [ -~]
# also contains '-', so a hyphen is always reported as DASH.
def t_DASH(t):
    ur'-'
    return t

# Matches a literal '^'.  The docstring is the regex PLY uses for this rule.
def t_CARET(t):
    ur'\^'
    return t

# Matches a literal '$'.  The docstring is the regex PLY uses for this rule.
def t_DOLLAR(t):
    ur'\$'
    return t

# Matches a literal '['.
# NOTE(review): the token is named RBRACKET but the pattern is the *opening*
# bracket, while t_LBRACKET matches ']'.  The names look swapped; the grammar
# that consumes these tokens is not visible here and presumably relies on the
# current pairing -- confirm against the parser before renaming.
def t_RBRACKET(t):
    ur'\['
    return t

# Matches a literal ']'.
# NOTE(review): the token is named LBRACKET but the pattern is the *closing*
# bracket, while t_RBRACKET matches '['.  The names look swapped; the grammar
# that consumes these tokens is not visible here and presumably relies on the
# current pairing -- confirm against the parser before renaming.
def t_LBRACKET(t):
    ur'\]'
    return t

# Matches a literal '.'.  The docstring is the regex PLY uses for this rule.
def t_ANYCHAR(t):
    ur'\.'
    return t

# Fallback for ordinary characters: any single ASCII character from space
# (0x20) through tilde (0x7e).  The metacharacter rules are defined earlier in
# the file and PLY tries function rules in definition order, so this rule only
# sees characters none of them claimed.
def t_NORMALCHAR(t):
    # Plain u'' literal: the pattern contains no backslashes, so it needs no
    # raw prefix and its value is exactly '[ -~]'.
    u'[ -~]'
    return t

# One multi-byte UTF-8 sequence: a lead byte followed by one to five further
# bytes.  Lead-byte ranges follow http://ja.wikipedia.org/wiki/UTF-8 :
#   0xC2-0xDF -> 2 bytes, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
#   0xF8-0xFB -> 5,       0xFC-0xFD -> 6.
# The trailing bytes are matched with '.', so they are not checked against
# the 0x80-0xBF continuation range.
# NOTE(review): presumably the lexer input is a UTF-8 encoded byte string
# scanned one byte per character -- confirm against the callers.
def t_MBCHAR(t):
    # Every alternative must start with a bracketed lead-byte class; without
    # the opening '[' the first alternative would match the literal text
    # "\xc2-\xdf]" and two-byte sequences would never be recognised.
    u'([\xc2-\xdf].|[\xe0-\xef]..|[\xf0-\xf7]...|[\xf8-\xfb]....|[\xfc-\xfd].....)'
    return t

# PLY error hook, called when no rule matches at the current input position.
# Prints the first character of t.value (per the PLY documentation, the
# unmatched remainder of the input) and then aborts lexing by raising.
# NOTE(review): `raise t` raises the token object itself rather than an
# exception instance.  Presumably PLY's LexToken does not derive from
# BaseException, in which case this surfaces as a TypeError instead of a
# meaningful lexing error -- confirm, and consider raising a real exception
# type (e.g. lex.LexError) once callers have been checked.
def t_error(t):
    print "Illegal character '%s'" % t.value[0]
    raise t

# Build the lexer at import time; PLY collects `tokens` and the t_* rules
# from the calling module.  The returned lexer object is not bound to a name
# here.
lex.lex()