425 lines
13 KiB
Python
425 lines
13 KiB
Python
# common.py
|
|
from .core import *
|
|
from .helpers import delimited_list, any_open_tag, any_close_tag
|
|
from datetime import datetime
|
|
|
|
|
|
# some other useful expressions - using lower-case class name since we are really using this as a namespace
|
|
class pyparsing_common:
    """Here are some common low-level expressions that may be useful in
    jump-starting parser development:

    - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
      :class:`scientific notation<sci_real>`)
    - common :class:`programming identifiers<identifier>`
    - network addresses (:class:`MAC<mac_address>`,
      :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
    - ISO8601 :class:`dates<iso8601_date>` and
      :class:`datetime<iso8601_datetime>`
    - :class:`UUID<uuid>`
    - :class:`comma-separated list<comma_separated_list>`
    - :class:`url`

    Parse actions (the pre-PEP8 camelCase aliases, e.g. ``convertToInteger``,
    are defined at the end of the class and remain available):

    - :class:`convert_to_integer`
    - :class:`convert_to_float`
    - :class:`convert_to_date`
    - :class:`convert_to_datetime`
    - :class:`strip_html_tags`
    - :class:`upcase_tokens`
    - :class:`downcase_tokens`

    Example::

        pyparsing_common.number.runTests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.fnumber.runTests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.hex_integer.runTests('''
            # hex numbers
            100
            FF
            ''')

        pyparsing_common.fraction.runTests('''
            # fractions
            1/2
            -3/4
            ''')

        pyparsing_common.mixed_integer.runTests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

        import uuid
        pyparsing_common.uuid.setParseAction(tokenMap(uuid.UUID))
        pyparsing_common.uuid.runTests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')

    prints::

        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # hex numbers
        100
        [256]

        FF
        [255]

        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    convert_to_integer = token_map(int)
    """
    Parse action for converting parsed integers to Python int
    """

    convert_to_float = token_map(float)
    """
    Parse action for converting parsed numbers to Python float
    """

    integer = Word(nums).set_name("integer").set_parse_action(convert_to_integer)
    """expression that parses an unsigned integer, returns an int"""

    hex_integer = (
        Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
    )
    """expression that parses a hexadecimal integer, returns an int"""

    signed_integer = (
        Regex(r"[+-]?\d+")
        .set_name("signed integer")
        .set_parse_action(convert_to_integer)
    )
    """expression that parses an integer with optional leading sign, returns an int"""

    # Numerator and denominator are copies of signed_integer (the call syntax
    # copies the expression) so that giving them a float-converting parse
    # action does not alter signed_integer itself.
    fraction = (
        signed_integer().set_parse_action(convert_to_float)
        + "/"
        + signed_integer().set_parse_action(convert_to_float)
    ).set_name("fraction")
    """fractional expression of an integer divided by an integer, returns a float"""
    # tokens are [numerator, "/", denominator]; divide first by last
    fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

    mixed_integer = (
        fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
    ).set_name("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
    # the separating "-" is suppressed, so the remaining tokens are summed
    mixed_integer.add_parse_action(sum)

    real = (
        Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
        .set_name("real number")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number and returns a float"""

    sci_real = (
        Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
        .set_name("real number with scientific notation")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number with optional
    scientific notation and returns a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).set_name("number").streamline()
    """any numeric expression, returns the corresponding Python type"""

    fnumber = (
        Regex(r"[+-]?\d+\.?\d*([eE][+-]?\d+)?")
        .set_name("fnumber")
        .set_parse_action(convert_to_float)
    )
    """any int or real number, returned as float"""

    identifier = Word(identchars, identbodychars).set_name("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(
        r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
    ).set_name("IPv4 address")
    "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

    _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
    _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
        "full IPv6 address"
    )
    _short_ipv6_address = (
        Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
        + "::"
        + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
    ).set_name("short IPv6 address")
    # The grammar alone would allow up to 7 groups on each side of "::";
    # reject any match with 8 or more hex groups in total.
    _short_ipv6_address.add_condition(
        lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
    )
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
    ipv6_address = Combine(
        (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
            "IPv6 address"
        )
    ).set_name("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    # The first delimiter is captured and back-referenced (\1) so that all
    # five delimiters in one address must be the same character.
    mac_address = Regex(
        r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
    ).set_name("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convert_to_date(fmt: str = "%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)

        Returns a parse action taking ``(s, loc, toks)`` that converts ``toks[0]``.
        A ``ValueError`` raised by ``strptime`` is re-raised as a
        :class:`ParseException` carrying the same message.

        Example::

            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.setParseAction(pyparsing_common.convertToDate())
            print(date_expr.parseString("1999-12-31"))

        prints::

            [datetime.date(1999, 12, 31)]
        """

        def cvt_fn(s, loc, toks):
            try:
                return datetime.strptime(toks[0], fmt).date()
            except ValueError as ve:
                # chain the original error so its traceback is preserved
                raise ParseException(s, loc, str(ve)) from ve

        return cvt_fn

    @staticmethod
    def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
        """Helper to create a parse action for converting parsed
        datetime string to Python datetime.datetime

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)

        Returns a parse action taking ``(s, loc, toks)`` that converts ``toks[0]``.
        A ``ValueError`` raised by ``strptime`` is re-raised as a
        :class:`ParseException` carrying the same message.

        Example::

            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.setParseAction(pyparsing_common.convertToDatetime())
            print(dt_expr.parseString("1999-12-31T23:59:59.999"))

        prints::

            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """

        def cvt_fn(s, loc, toks):
            try:
                return datetime.strptime(toks[0], fmt)
            except ValueError as ve:
                # chain the original error so its traceback is preserved
                raise ParseException(s, loc, str(ve)) from ve

        return cvt_fn

    iso8601_date = Regex(
        r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
    ).set_name("ISO8601 date")
    "ISO8601 date (``yyyy-mm-dd``)"

    iso8601_datetime = Regex(
        r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
    ).set_name("ISO8601 datetime")
    "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"

    uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
    "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

    # matches any opening or closing tag and suppresses it from the output
    _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()

    @staticmethod
    def strip_html_tags(s: str, l: int, tokens: ParseResults):
        """Parse action to remove HTML tags from web page HTML source

        Only ``tokens[0]`` is processed; ``s`` and ``l`` are accepted to fit
        the parse action signature and are not used.

        Example::

            # strip HTML links from normal text
            text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
            td, td_end = makeHTMLTags("TD")
            table_text = td + SkipTo(td_end).setParseAction(pyparsing_common.stripHTMLTags)("body") + td_end
            print(table_text.parseString(text).body)

        Prints::

            More info at the pyparsing wiki page
        """
        return pyparsing_common._html_stripper.transform_string(tokens[0])

    _commasepitem = (
        Combine(
            OneOrMore(
                ~Literal(",")
                + ~LineEnd()
                + Word(printables, exclude_chars=",")
                + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
            )
        )
        .streamline()
        .set_name("commaItem")
    )
    comma_separated_list = delimited_list(
        Opt(quoted_string.copy() | _commasepitem, default="")
    ).set_name("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    upcase_tokens = staticmethod(token_map(lambda t: t.upper()))
    """Parse action to convert tokens to upper case."""

    downcase_tokens = staticmethod(token_map(lambda t: t.lower()))
    """Parse action to convert tokens to lower case."""

    # NOTE(review): the pattern below starts with "^" and ends with "$", so it
    # presumably only matches a URL that spans the whole input string rather
    # than one embedded in surrounding text - confirm before using for scanning.
    # fmt: off
    url = Regex(
        # https://mathiasbynens.be/demo/url-regex
        # https://gist.github.com/dperini/729294
        r"^" +
        # protocol identifier (optional)
        # short syntax // still required
        r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
        # user:pass BasicAuth (optional)
        r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
        r"(?P<host>" +
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
        r"|" +
        # host & domain names, may end with dot
        # can be replaced by a shortest alternative
        # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
        r"(?:" +
        r"(?:" +
        r"[a-z0-9\u00a1-\uffff]" +
        r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
        r")?" +
        r"[a-z0-9\u00a1-\uffff]\." +
        r")+" +
        # TLD identifier name, may end with dot
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
        r")" +
        # port number (optional)
        r"(:(?P<port>\d{2,5}))?" +
        # resource path (optional)
        r"(?P<path>\/[^?# ]*)?" +
        # query string (optional)
        r"(\?(?P<query>[^#]*))?" +
        # fragment (optional)
        r"(#(?P<fragment>\S*))?" +
        r"$"
    ).set_name("url")
    # fmt: on

    # pre-PEP8 compatibility names
    convertToInteger = convert_to_integer
    convertToFloat = convert_to_float
    convertToDate = convert_to_date
    convertToDatetime = convert_to_datetime
    stripHTMLTags = strip_html_tags
    upcaseTokens = upcase_tokens
    downcaseTokens = downcase_tokens
|
|
|
|
|
|
# Every class attribute of pyparsing_common that is itself a parser
# expression, in class-definition order.
_builtin_exprs = list(
    filter(
        lambda attr: isinstance(attr, ParserElement),
        vars(pyparsing_common).values(),
    )
)
|