# Source code for sdss_brain.helpers.parsing
# !/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Filename: parsing.py
# Project: helpers
# Author: Brian Cherinka
# Created: Wednesday, 7th October 2020 10:54:07 am
# License: BSD 3-clause "New" or "Revised" License
# Copyright (c) 2020 Brian Cherinka
# Last Modified: Wednesday, 7th October 2020 10:54:07 am
# Modified By: Brian Cherinka
from __future__ import print_function, division, absolute_import
import re
import pathlib
from typing import Union
from itertools import groupby
from sdss_brain import log
def create_object_pattern(regex: str = None, keys: list = None, keymap: dict = None,
                          delimiter: str = '-', exclude: list = None, include: list = None,
                          order: list = None) -> str:
    """ Create a regex pattern used to parse a data input as an object id

    The returned pattern always wraps its content in a named group called
    ``objectid``.  Three modes are supported, checked in this order:

    1. ``regex`` is given: the custom pattern is wrapped as-is.
    2. Neither ``keys`` nor ``keymap`` is given: a greedy default is returned
       that accepts any string not starting with ``/``, ``$`` or ``.``.
    3. Otherwise one named group per key is built and the groups are joined
       with ``delimiter``.

    Parameters
    ----------
    regex : str
        A custom regex pattern
    keys : list
        A list of (access) names to build a pattern out of.  The list is not
        modified and repeated names are only used once.
    keymap : dict
        A dict of key name and pattern to build a pattern out of.  When
        ``keys`` is empty, the keymap names are used as the keys.
    delimiter : str
        The delimiter to use when joining the keys. Default is "-".  It is
        inserted into the pattern verbatim, i.e. it is not regex-escaped.
    exclude : list
        A list of names to exclude from the keys
    include : list
        A list of names to only include from the keys.  Ignored when
        ``order`` is also given.
    order : list
        A list of names specifying the order in which to create the keyed pattern.
        Only keys named in ``order`` are kept.

    Returns
    -------
    pattern : str
        A regex pattern to use for parsing an objectid

    Raises
    ------
    AssertionError
        If ``keys`` is not a list or ``keymap`` is not a dict
    KeyError
        If a ``keymap`` is given and a selected key is missing from it
    """
    # use a custom regex pattern
    if regex:
        return rf'(?P<objectid>(?![/$.])({regex}))'

    # if no keys or keymap, use a greedy default
    if not keys and not keymap:
        return r'(?P<objectid>^[^/$.](.+)?)'

    # kept as asserts so the exception type seen by existing callers is unchanged
    assert isinstance(keys, (list, type(None))), 'keys must be a list'
    assert isinstance(keymap, (dict, type(None))), 'keymap must be a dict'

    # Work on a fresh, de-duplicated list in the original key order.  This keeps
    # the caller's list untouched and avoids emitting the same named group twice,
    # which the re module rejects when compiling.
    names = list(dict.fromkeys(keys if keys else keymap))

    # exclude the named keys
    if exclude:
        dropped = set(exclude)
        names = [name for name in names if name not in dropped]

    # only include the named keys; "order" takes precedence over "include"
    wanted = order or include
    if wanted:
        allowed = set(wanted)
        names = [name for name in names if name in allowed]

    # order the keys; every remaining name is guaranteed to be in "order"
    if order:
        names.sort(key=order.index)

    # build one named group per key, using the keymap pattern when available
    if keymap:
        groups = [fr'(?P<{name}>{keymap[name]})' for name in names]
    else:
        groups = [fr'(?P<{name}>(.+)?)' for name in names]

    # join into a single pattern
    delimiter = delimiter or '-'
    return rf'(?P<objectid>(?![/$.])({delimiter.join(groups)}))'
def parse_data_input(value: str, regex: str = None, keys: list = None, keymap: dict = None,
                     delimiter: str = '-', exclude: list = None, include: list = None,
                     order: list = None, inputs: bool = False) -> dict:
    r''' Parse data input for a filename or an object id

    The input is matched against a combined pattern: a filename alternative
    (a string starting with ``/``, ``$`` or ``.``) or an object id alternative
    built by `create_object_pattern` from the remaining arguments.

    Parameters
    ----------
    value : str
        The input string to perform a pattern match on.  A pathlib.Path is
        also accepted and is converted to a string before matching.
    regex : str
        A custom regex pattern
    keys : list
        A list of (access) names to build a pattern out of
    keymap : dict
        A dict of key name and pattern to build a pattern out of
    delimiter : str
        The delimiter to use when joining the keys. Default is "-".
    exclude : list
        A list of names to exclude from the keys
    include : list
        A list of names to only include from the keys
    order : list
        A list of names specifying the order in which to create the keyed pattern
    inputs : bool
        If True, returns the parser inputs. Default is False.

    Returns
    -------
    matches : dict
        A dict with keys "filename", "objectid", "parsed_groups" and any other
        named matches.  When the input is recognised as a filename, "filename"
        holds the complete input string and "parsed_groups" is None.  When
        nothing matches at all, only ``{'filename': value}`` is returned, with
        the value exactly as it was passed in.

    Raises
    ------
    AssertionError
        If ``value`` is not a str or pathlib.Path

    Example
    -------
    >>> # parse a filename
    >>> parse_data_input('/path/to/a/file.txt')
    {'filename': '/path/to/a/file.txt', 'objectid': None, 'parsed_groups': None}

    >>> # parse an objectid as is
    >>> parse_data_input('8485-1901')
    {'filename': None, 'objectid': '8485-1901', 'parsed_groups': ['8485-1901', '485-1901']}

    >>> # parse an objectid using a custom pattern
    >>> parse_data_input('8485-1901', regex=r'(?P<plate>\d{4,5})-(?P<ifu>\d{3,5})')
    {'filename': None, 'objectid': '8485-1901', 'plate': '8485', 'ifu': '1901', 'parsed_groups': ['8485-1901', '8485', '1901']}

    >>> # parse an objectid using access keywords
    >>> keys=['drpver', 'plate', 'ifu', 'wave']
    >>> parse_data_input('v1-8485-1901-LOG', keys=keys)
    {'filename': None, 'objectid': 'v1-8485-1901-LOG', 'drpver': 'v1', 'plate': '8485',
     'ifu': '1901', 'wave': 'LOG', 'parsed_groups': ['v1-8485-1901-LOG', 'v1', '8485', '1901', 'LOG']}

    >>> # parse an objectid specifying the input order of the keys
    >>> parse_data_input('8485-1901', keys=keys, order=['plate', 'ifu'])
    {'filename': None, 'objectid': '8485-1901', 'plate': '8485', 'ifu': '1901', 'parsed_groups': ['8485-1901', '8485', '1901']}
    '''
    # kept as an assert so the exception type seen by existing callers is unchanged
    assert isinstance(value, (str, pathlib.Path)), 'input value must be a str or pathlib.Path'

    # all matching is done on the string form of the input
    text = str(value)

    # set default file pattern
    file_pattern = r'(?P<filename>^[/$.](.+)?(.[a-z]+))'

    # create an object id regex pattern using a specified pattern or generate a default one
    obj_pattern = create_object_pattern(regex=regex, keys=keys, keymap=keymap, delimiter=delimiter,
                                        exclude=exclude, include=include, order=order)

    # final pattern; note the trailing "$" binds only to the object id alternative
    pattern = fr'^{file_pattern}|{obj_pattern}$'

    # compile and match the pattern
    pattern_match = re.compile(pattern).match(text)

    # if no match, assume value is a filename and return nothing else
    if not pattern_match:
        log.warning('No pattern match found. Defaulting to input value as a filename.')
        return {'filename': value}

    # the combined pattern always defines the "filename" and "objectid" named
    # groups, so the group dict is never empty here
    matches = pattern_match.groupdict()

    if matches.get('filename'):
        # The filename alternative is not anchored at the end, so the regex may
        # only capture a prefix of the input (e.g. "/path/file.h" for
        # "/path/file.h5").  Report the complete input as the filename instead.
        matches['filename'] = text
        matches['parsed_groups'] = None
    else:
        # add the groups to a new key, dropping None/empty values and
        # duplicates while keeping first-occurrence order
        matches['parsed_groups'] = list(dict.fromkeys(
            group for group in pattern_match.groups() if group))

    # store the parser inputs
    if inputs:
        matches['parsed_inputs'] = {'pattern': pattern, 'input_regex': regex,
                                    'object_pattern': obj_pattern, 'file_pattern': file_pattern}

    return matches
def raw_parse(value: str, regex: str = None) -> Union[dict, tuple, str, None]:
    ''' Match a string via a regex pattern with no frills

    The pattern is applied at the start of the string (``re.match`` semantics).

    Parameters
    ----------
    value : str
        The input string to match on.  A pathlib.Path is also accepted and
        is converted to a string before matching.
    regex : str
        The regex pattern to use for matching.  Required, despite the
        ``None`` default in the signature.

    Returns
    -------
    A matched group
        The dict of named groups if the pattern defines any; otherwise the
        tuple of positional groups if there are any; otherwise the matched
        string itself.  None if the pattern does not match.

    Raises
    ------
    TypeError
        If no regex pattern is given
    '''
    # re.compile(None) would also raise a TypeError, but with a message that
    # does not point at the missing argument
    if regex is None:
        raise TypeError('raw_parse requires a regex pattern to match with; got None')

    # accept path objects the same way parse_data_input does
    if isinstance(value, pathlib.Path):
        value = str(value)

    match = re.compile(regex).match(value)
    if not match:
        return None

    # prefer named groups, then positional groups, then the bare match
    return match.groupdict() or match.groups() or match.group()
