Source code for parsemon.parser

"""This module contains the basic building blocks for implementing parsers"""

from dataclasses import dataclass
from functools import reduce
from typing import Any, List, Type, TypeVar

from .coroutine import do
from .error import FileTooLarge, ParsingFailed
from .internals import bind, choose_parser, one_of, run, try_parser, unit
from .sourcemap import (
    display_location,
    find_linebreak_indices,
    find_location_in_indices,
)
from .stream import Stream, StringStream

T = TypeVar("T")


[docs]@dataclass class ParsingResult: value: Any remaining_input: str
def parsing_result(value, remaining_input): return ParsingResult(value, remaining_input) _NO_FURTHER_RESULT = object() """This is only intended for internal use. We use _NO_FURTHER_RESULT to signal that a parser was not able to yield a result. We could use None here but then we would not be able to work with parsers that would actually return None as a positive parsing result. """ _DELIMITER_TOKEN = object() """This is only intended for internal use. Its primary use is for cases where we want to parse, until we find some form of delimiter. """
[docs]def chain(first, second, *rest): """Combine to parsers and only use the result of the second parser :param first: a parser that consumes input, the result will be discarded :param second: a parser that is applied after first, the result of this parser will be returned by the resulting parser """ def _chain(p1, p2): return bind(p1, lambda _: p2) first_and_second_parser_combined = _chain(first, second) return reduce(_chain, rest, first_and_second_parser_combined)
[docs]def choice( first_parser, second_parser, ): """Try one parser and try a second one if the first one fails""" return choose_parser(first_parser, second_parser)
[docs]def choices(parser, *parsers): """Try the given parsers one at a time until one succeeds""" return reduce(choice, [parser] + list(parsers))
[docs]@do def many(original_parser): """Apply a parser 0 or more times The resulting parser is greedy, which means that it will be applied as often as possible, which also includes 0 times. Think of this as Kleene-Star. :param original_parser: this parser will be applied as often as possible by the resulting new parser """ results = [] while True: current_result = yield choice( try_parser(original_parser), unit(_NO_FURTHER_RESULT) ) if current_result is _NO_FURTHER_RESULT: break else: results.append(current_result) return results
[docs]@do def many1(original_parser): """Apply a parser 1 or more times The resulting parser is greedy, which means that it will be applied as often as possible. Think of this as an equivalent to the '+' operator in regular expressions. original_parser -- this parser will be applied 1 or more times by the resulting parser """ return [(yield original_parser)] + (yield many(original_parser))
[docs]@do def seperated_by(parser, seperator): """Apply the input parser as often as possible, where occurences are seperated by input that can be parsed by 'seperator'. This can be useful to parse lists with seperators in between. The parser ``seperated_by(many(none_of(',')), literal(','))`` will parse the string ``1,2,3,4`` and return the list ``['1','2','3','4']``. """ results: List[Any] = [] first_elem = yield choice(try_parser(parser), unit(_NO_FURTHER_RESULT)) if first_elem is _NO_FURTHER_RESULT: return results rest_elems = yield many(chain(seperator, parser)) return [first_elem] + rest_elems
[docs]@do def enclosed_by( parser, prefix_parser, suffix_parser=None, ): """Parse a string enclosed by delimeters The parser ``enclosed_by(many(none_of('"')),literal('"'))`` will consume the string ``"example"`` and return the python string ``'example'``. """ yield prefix_parser result = yield parser yield suffix_parser or prefix_parser return result
[docs]def run_parser( p, input_string: str, stream_implementation: Type[Stream] = StringStream, ): """Parse string input_string with parser p""" def render_failure(failure): linebreaks = find_linebreak_indices(input_string) line, column = find_location_in_indices(failure.position, linebreaks) return "{message} @ {location}".format( message=failure.message, location=display_location(line, column) ) stream, result = run(p, StringStream.from_string(input_string)) if result.is_failure(): failures = result.get_failures() final_message = " OR ".join(map(render_failure, failures)) raise ParsingFailed(final_message) else: return parsing_result(value=result.value, remaining_input=stream.to_string())
[docs]def parse_file(parser, input_file, max_size=None): content = input_file.read(max_size) if input_file.read(1): raise FileTooLarge("File to be parsed exceeded maximum size") return run_parser(parser, content)
whitespace_unicode_characters_decimals: List[int] = [ 9, 10, 11, 12, 13, 32, 133, 160, 5760, 8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202, 8232, 8233, 8239, 8287, 12288, ] whitespace = one_of("".join(map(chr, whitespace_unicode_characters_decimals))) """Parse any character that is classified as a whitespace character by unicode standard. That includes newline characters."""
[docs]@do def until(repeating_parser, delimiter_parser): """Parse `repeating_parser` until `delimiter_parser` is found. `delimiter_parser` is has always precedence over `repeating_parser`. You can think about it this way: First we check if `delimiter_parser` matches the input succesfully, if this is not the case, we try `repeating_parser`. If `repeating_parser` fails to match, then `until(repeating_parser, delimiter_parser)` as a whole fails. If `repeating_parser` matches, then we start over again. When `delimiter_parser` matches eventually, we return all results of `repeating_parser` in a list. Note that both, `delimiter_parser` and `repeating_parser` consume input. This is especially important if both parser have overlap on the characters they consume, e.g. `until(character(), literal('end'))`. Make use of `try_parser` and `look_ahead` as you see fit for your usecase. """ found_elements = [] while True: result = yield choice( chain(delimiter_parser, unit(_DELIMITER_TOKEN)), repeating_parser ) if result is _DELIMITER_TOKEN: break else: found_elements.append(result) return tuple(found_elements)
[docs]@do def repeat(parser, count): results = [] for _ in range(count): results.append((yield parser)) return results