Trying to make a dictionary using re.VERBOSE

373 views Asked by At

I'm trying to write a regex pattern to make a log line like:

# Sample log line that the pattern below is meant to parse.
string= '30.95.91.251 - larson8319 [21/Jun/2019:16:02:02 -0700] "PUT /one-to-one/whiteboard HTTP/1.0" 401 7270'

to look like:

# Desired result: one dictionary per log line, keyed by field name.
dic= {"host":"30.95.91.251", 
      "user_name":"larson8319", 
      "time":"21/Jun/2019:16:02:02 -0700",
      "request":"PUT /one-to-one/whiteboard HTTP/1.0"}

using this code:

# Asker's attempt: a verbose-mode regex with four named groups
# (host, user_name, time, request), one sub-pattern per line.
# A raw string keeps the backslash sequences (`\ `, `\w`, `\W`) intact for the
# regex engine and avoids Python's invalid-escape-sequence warnings.
# NOTE(review): as reported in the question, this attempt still leaves the
# square brackets in `time` and does not capture the request part as intended.
pattern = r'''
(?P<host>.*)
(-\ )
(?P<user_name>\w*)
(?P<time>\W.+)
(?P<request>\w+)
'''
# NOTE(review): `logdata` is not defined in this snippet; presumably it holds
# the log text (e.g. the sample line shown earlier) -- confirm before running.
for item in re.finditer(pattern, logdata, re.VERBOSE):
    # groupdict() maps each named group to the text it matched.
    print(item.groupdict())

However, I couldn't make the square brackets disappear from the time field, and I couldn't get the request part to be captured properly.

1

There is 1 answer

0
Jan On

Be more specific and use character classes ([...]):

(?P<host>[\d.]+)[-\s]+
(?P<user_name>\w+)\s+
\[(?P<time>[^][]+)\]\s+
"(?P<request>[^"]+)"

See a demo on regex101.com.


Or use a parser instead:

from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor

# Sample log line to parse; the fields after the quoted request are ignored by the visitor.
string = '30.95.91.251 - larson8319 [21/Jun/2019:16:02:02 -0700] "PUT /one-to-one/whiteboard HTTP/1.0" 401 7270'


class LogVisitor(NodeVisitor):
    """Turn one log line into a dict with 'ip', 'user', 'time' and 'request'.

    The class-level ``grammar`` splits an entry into four fields followed by
    an ignored remainder; each ``visit_<rule>`` method returns a
    ``(key, value)`` pair and ``visit_entry`` assembles those pairs into the
    final dictionary.
    """

    # Rules: `junk` swallows separators (dashes/whitespace) after a field,
    # `time` is wrapped in square brackets, `request` in double quotes, and
    # `rest` matches whatever follows the request.
    grammar = Grammar(
        r"""
        entry   = ip user time request rest
        ip      = ~"[\d.]+" junk
        user    = ~"\w+" junk
        time    = "[" ~"[^][]+" "]" junk
        request = '"' ~"[^\"]+" '"'
        junk    = ~"[-\s]*"
        rest    = ~".*"
        """
    )

    def generic_visit(self, node, visited_children):
        """Fallback visitor: pass the children through, or the node itself if there are none."""
        if visited_children:
            return visited_children
        return node

    def visit_entry(self, node, visited_children):
        """Build the result dict from the first four children, each a (key, value) pair."""
        ip_pair, user_pair, time_pair, request_pair, *_ignored = visited_children
        return dict((ip_pair, user_pair, time_pair, request_pair))

    def __clean__(self, node, visited_children, first=False):
        """Return the ``.text`` of the child that carries the field's value.

        With ``first`` left False the value is the first child; with ``first``
        True the first child is skipped (the opening ``[`` or ``"`` literal in
        the ``time`` / ``request`` rules) and the second child is used.
        """
        if first:
            _opening, target, *_trailing = visited_children
            return target.text
        target, *_trailing = visited_children
        return target.text

    def visit_ip(self, node, visited_children):
        """Return the ('ip', text) pair for an ``ip`` rule match."""
        value = self.__clean__(node, visited_children)
        return ('ip', value)

    def visit_user(self, node, visited_children):
        """Return the ('user', text) pair for a ``user`` rule match."""
        value = self.__clean__(node, visited_children)
        return ('user', value)

    def visit_time(self, node, visited_children):
        """Return the ('time', text) pair, skipping the opening bracket."""
        value = self.__clean__(node, visited_children, first=True)
        return ('time', value)

    def visit_request(self, node, visited_children):
        """Return the ('request', text) pair, skipping the opening quote."""
        value = self.__clean__(node, visited_children, first=True)
        return ('request', value)


# Create the visitor defined above.
lv = LogVisitor()
# `parse` is inherited from parsimonious' NodeVisitor; presumably it parses
# the text with the class-level `grammar` and then visits the resulting tree
# (confirm against the parsimonious documentation).
result = lv.parse(string)
# Print the value returned by the visitor for the whole entry.
print(result)

Which would yield

{'ip': '30.95.91.251', 'user': 'larson8319', 'time': '21/Jun/2019:16:02:02 -0700', 'request': 'PUT /one-to-one/whiteboard HTTP/1.0'}