#!/usr/bin/python
"""
Lexing error finder
~~~~~~~~~~~~~~~~~~~
For the source files given on the command line, display
the text where Error tokens are being generated, along
with some context.
:copyright: Copyright 2006-2023 by the Pygments team, see AUTHORS.
:license: BSD, see LICENSE for details.
"""
import os
import sys
import struct
# always prefer Pygments from source if it exists
srcpath = os.path.join(os.path.dirname(__file__), '..')
if os.path.isdir(os.path.join(srcpath, 'pygments')):
sys.path.insert(0, srcpath)
# ruff: noqa: E402
from pygments.lexer import ExtendedRegexLexer, LexerContext
from pygments.lexers import get_lexer_by_name, find_lexer_class, \
find_lexer_class_for_filename, guess_lexer
from pygments.token import Error, Text, _TokenType
from pygments.cmdline import _parse_options
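

# DebuggingRegexLexer keeps the lexer context (ctx) and the current match (m) as
# instance attributes, so that main() below can print lx.ctx.stack next to the
# offending token whenever an Error token is produced.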
class DebuggingRegexLexer(ExtendedRegexLexer):
"""Make the state stack, position and current match instance attributes."""
def get_tokens_unprocessed(self, text, stack=('root',)):
"""
Split ``text`` into (tokentype, text) pairs.
``stack`` is the initial stack (default: ``['root']``)
"""
tokendefs = self._tokens
self.ctx = ctx = LexerContext(text, 0)
ctx.stack = list(stack)
statetokens = tokendefs[ctx.stack[-1]]
while 1:
for rexmatch, action, new_state in statetokens:
self.m = m = rexmatch(text, ctx.pos, ctx.end)
if m:
if action is not None:
if type(action) is _TokenType:
yield ctx.pos, action, m.group()
ctx.pos = m.end()
else:
if not isinstance(self, ExtendedRegexLexer):
yield from action(self, m)
ctx.pos = m.end()
else:
yield from action(self, m, ctx)
if not new_state:
# altered the state stack?
statetokens = tokendefs[ctx.stack[-1]]
if new_state is not None:
# state transition
if isinstance(new_state, tuple):
for state in new_state:
if state == '#pop':
ctx.stack.pop()
elif state == '#push':
ctx.stack.append(ctx.stack[-1])
else:
ctx.stack.append(state)
elif isinstance(new_state, int):
# pop
del ctx.stack[new_state:]
elif new_state == '#push':
ctx.stack.append(ctx.stack[-1])
else:
assert False, 'wrong state def: %r' % new_state
statetokens = tokendefs[ctx.stack[-1]]
break
else:
try:
if ctx.pos >= ctx.end:
break
if text[ctx.pos] == '\n':
# at EOL, reset state to 'root'
ctx.stack = ['root']
statetokens = tokendefs['root']
yield ctx.pos, Text, '\n'
ctx.pos += 1
continue
yield ctx.pos, Error, text[ctx.pos]
ctx.pos += 1
except IndexError:
break
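

# Illustrative only: with a leading byte of 0x01, decode_atheris() below treats the
# remaining bytes as 7-bit ASCII, so decode_atheris(b'\x01hi') returns 'hi'.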
def decode_atheris(bstr):
"""Decode a byte string into a Unicode string using the algorithm
of Google's Atheris fuzzer library, which aims to produce a wide
range of possible Unicode inputs.
Corresponds to ConsumeUnicodeImpl() with filter_surrogates=false in
https://github.com/google/atheris/blob/master/fuzzed_data_provider.cc
"""
if len(bstr) < 2:
return ''
    # The first byte only selects whether the rest is decoded as ASCII, UTF-16 or UTF-32
spec, bstr = bstr[0], bstr[1:]
if spec & 1: # pure ASCII
return ''.join(chr(ch & 0x7f) for ch in bstr)
elif spec & 2: # UTF-16
bstr = bstr if len(bstr) % 2 == 0 else bstr[:-1]
return bstr.decode('utf16')
# else UTF-32
def valid_codepoint(ch):
ch &= 0x1fffff
if ch & 0x100000:
ch &= ~0x0f0000
return chr(ch)
chars = struct.unpack('%dI%dx' % divmod(len(bstr), 4), bstr)
    return ''.join(map(valid_codepoint, chars))


def main(fn, lexer=None, options={}):
if fn == '-':
        text = sys.stdin.buffer.read()  # read bytes; decoding happens below
else:
with open(fn, 'rb') as fp:
text = fp.read()
if decode_strategy == 'latin1':
try:
text = text.decode('utf8')
except UnicodeError:
print('Warning: non-UTF8 input, using latin1')
text = text.decode('latin1')
elif decode_strategy == 'utf8-ignore':
try:
text = text.decode('utf8')
except UnicodeError:
print('Warning: ignoring non-UTF8 bytes in input')
text = text.decode('utf8', 'ignore')
elif decode_strategy == 'atheris':
text = decode_atheris(text)
text = text.strip('\n') + '\n'
if lexer is not None:
lxcls = get_lexer_by_name(lexer).__class__
elif guess:
lxcls = guess_lexer(text).__class__
print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
lxcls.__name__))
else:
lxcls = find_lexer_class_for_filename(os.path.basename(fn))
if lxcls is None:
name, rest = fn.split('_', 1)
lxcls = find_lexer_class(name)
if lxcls is None:
raise AssertionError('no lexer found for file %r' % fn)
print('Using lexer: %s (%s.%s)' % (lxcls.name, lxcls.__module__,
lxcls.__name__))
debug_lexer = False
# if profile:
# # does not work for e.g. ExtendedRegexLexers
# if lxcls.__bases__ == (RegexLexer,):
# # yes we can! (change the metaclass)
# lxcls.__class__ = ProfilingRegexLexerMeta
# lxcls.__bases__ = (ProfilingRegexLexer,)
# lxcls._prof_sort_index = profsort
# else:
# if lxcls.__bases__ == (RegexLexer,):
# lxcls.__bases__ = (DebuggingRegexLexer,)
# debug_lexer = True
# elif lxcls.__bases__ == (DebuggingRegexLexer,):
# # already debugged before
# debug_lexer = True
# else:
# # HACK: ExtendedRegexLexer subclasses will only partially work here.
# lxcls.__bases__ = (DebuggingRegexLexer,)
# debug_lexer = True
lx = lxcls(**options)
lno = 1
tokens = []
states = []
def show_token(tok, state):
reprs = list(map(repr, tok))
print(' ' + reprs[1] + ' ' + ' ' * (29-len(reprs[1])) + reprs[0], end=' ')
if debug_lexer:
print(' ' + ' ' * (29-len(reprs[0])) + ' : '.join(state)
if state else '', end=' ')
print()
for type, val in lx.get_tokens(text):
lno += val.count('\n')
if type == Error and not ignerror:
print('Error parsing', fn, 'on line', lno)
if not showall:
print('Previous tokens' + (debug_lexer and ' and states' or '') + ':')
for i in range(max(len(tokens) - num, 0), len(tokens)):
if debug_lexer:
show_token(tokens[i], states[i])
else:
show_token(tokens[i], None)
print('Error token:')
vlen = len(repr(val))
print(' ' + repr(val), end=' ')
if debug_lexer and hasattr(lx, 'ctx'):
print(' ' * (60-vlen) + ' : '.join(lx.ctx.stack), end=' ')
print()
print()
return 1
tokens.append((type, val))
if debug_lexer:
if hasattr(lx, 'ctx'):
states.append(lx.ctx.stack[:])
else:
states.append(None)
if showall:
show_token((type, val), states[-1] if debug_lexer else None)
return 0


def print_help():
print('''\
Pygments development helper to quickly debug lexers.
scripts/debug_lexer.py [options] file ...
Give one or more filenames to lex them and display possible error tokens
and/or profiling info. Files are assumed to be encoded in UTF-8.
Selecting lexer and options:
-l NAME use lexer named NAME (default is to guess from
the given filenames)
-g guess lexer from content
-u if input is non-utf8, use "ignore" handler instead
of using latin1 encoding
-U use Atheris fuzzer's method of converting
byte input to Unicode
-O OPTIONSTR use lexer options parsed from OPTIONSTR
Debugging lexing errors:
-n N show the last N tokens on error
-a always show all lexed tokens (default is only
to show them when an error occurs)
-e do not stop on error tokens
Profiling:
-p use the ProfilingRegexLexer to profile regexes
instead of the debugging lexer
-s N sort profiling output by column N (default is
column 4, the time per call)
''')


num = 10
showall = False
ignerror = False
lexer = None
options = {}
profile = False
profsort = 4
guess = False
decode_strategy = 'latin1'
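# decode_strategy is switched to 'utf8-ignore' by -u and to 'atheris' by -U in the
# option handling below; the default decodes UTF-8 and falls back to latin1.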


if __name__ == '__main__':
import getopt
opts, args = getopt.getopt(sys.argv[1:], 'n:l:aepO:s:hguU')
for opt, val in opts:
if opt == '-n':
num = int(val)
elif opt == '-a':
showall = True
elif opt == '-e':
ignerror = True
elif opt == '-l':
lexer = val
elif opt == '-p':
profile = True
elif opt == '-s':
profsort = int(val)
elif opt == '-O':
options = _parse_options([val])
elif opt == '-g':
guess = True
elif opt == '-u':
decode_strategy = 'utf8-ignore'
elif opt == '-U':
decode_strategy = 'atheris'
elif opt == '-h':
print_help()
sys.exit(0)
ret = 0
if not args:
print_help()
for f in args:
ret += main(f, lexer, options)
sys.exit(bool(ret))