"""
This module implements Git's wildmatch pattern matching which itself is derived
from Rsync's wildmatch. Git uses wildmatch for its ".gitignore" files.
"""

import re
import warnings
from typing import (
    AnyStr,
    Optional,  # Replaced by `X | None` in 3.10.
    Tuple)  # Replaced by `tuple` in 3.9.

from .. import util
from ..pattern import RegexPattern

_BYTES_ENCODING = 'latin1'
|
||
|
"""
|
||
|
The encoding to use when parsing a byte string pattern.
|
||
|
"""
|
||
|
|
||
|
_DIR_MARK = 'ps_d'
|
||
|
"""
|
||
|
The regex group name for the directory marker. This is only used by
|
||
|
:class:`GitIgnoreSpec`.
|
||
|
"""
|
||
|
|
||
|
|
||
|
class GitWildMatchPatternError(ValueError):
    """
    The :class:`GitWildMatchPatternError` indicates an invalid git wild match
    pattern.

    It derives from :class:`ValueError`, so callers that catch
    :class:`ValueError` also catch this error.
    """
    pass


class GitWildMatchPattern(RegexPattern):
    """
    The :class:`GitWildMatchPattern` class represents a compiled Git wildmatch
    pattern.
    """

    # Keep the dict-less class hierarchy.
    __slots__ = ()

    @classmethod
    def pattern_to_regex(
        cls,
        pattern: AnyStr,
    ) -> Tuple[Optional[AnyStr], Optional[bool]]:
        """
        Convert the pattern into a regular expression.

        *pattern* (:class:`str` or :class:`bytes`) is the pattern to convert into a
        regular expression.

        Returns the uncompiled regular expression (:class:`str`, :class:`bytes`, or
        :data:`None`); and whether matched files should be included (:data:`True`),
        excluded (:data:`False`), or if it is a null-operation (:data:`None`). The
        regular expression has the same string type as *pattern*.

        Raises :class:`TypeError` if *pattern* is neither :class:`str` nor
        :class:`bytes`. Raises :class:`GitWildMatchPatternError` if the pattern is
        empty after normalization or ends with a dangling escape character.
        """
        if isinstance(pattern, str):
            return_type = str
        elif isinstance(pattern, bytes):
            return_type = bytes
            pattern = pattern.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"pattern:{pattern!r} is not a unicode or byte string.")

        # Remember the pattern as given (after decoding) for error messages.
        original_pattern = pattern

        if pattern.endswith('\\ '):
            # EDGE CASE: Spaces can be escaped with backslash. If a pattern that ends
            # with backslash followed by a space, only strip from left.
            pattern = pattern.lstrip()
        else:
            pattern = pattern.strip()

        if pattern.startswith('#'):
            # A pattern starting with a hash ('#') serves as a comment (neither
            # includes nor excludes files). Escape the hash with a back-slash to match
            # a literal hash (i.e., '\#').
            regex = None
            include = None

        elif pattern == '/':
            # EDGE CASE: According to `git check-ignore` (v2.4.1), a single '/' does
            # not match any file.
            regex = None
            include = None

        elif pattern:
            if pattern.startswith('!'):
                # A pattern starting with an exclamation mark ('!') negates the pattern
                # (exclude instead of include). Escape the exclamation mark with a
                # back-slash to match a literal exclamation mark (i.e., '\!').
                include = False
                # Remove leading exclamation mark.
                pattern = pattern[1:]
            else:
                include = True

            # Allow a regex override for edge cases that cannot be handled through
            # normalization.
            override_regex = None

            # Split pattern into segments.
            pattern_segs = pattern.split('/')

            # Check whether the pattern is specifically a directory pattern before
            # normalization.
            is_dir_pattern = not pattern_segs[-1]

            # Normalize pattern to make processing easier.

            # EDGE CASE: Deal with duplicate double-asterisk sequences. Collapse each
            # sequence down to one double-asterisk. Iterate over the segments in
            # reverse and remove the duplicate double asterisks as we go. Walking
            # backwards keeps the remaining indices valid while deleting.
            for i in range(len(pattern_segs) - 1, 0, -1):
                prev = pattern_segs[i-1]
                seg = pattern_segs[i]
                if prev == '**' and seg == '**':
                    del pattern_segs[i]

            if len(pattern_segs) == 2 and pattern_segs[0] == '**' and not pattern_segs[1]:
                # EDGE CASE: The '**/' pattern should match everything except individual
                # files in the root directory. This case cannot be adequately handled
                # through normalization. Use the override.
                override_regex = f'^.+(?P<{_DIR_MARK}>/).*$'

            if not pattern_segs[0]:
                # A pattern beginning with a slash ('/') will only match paths directly
                # on the root directory instead of any descendant paths. So, remove
                # empty first segment to make pattern relative to root.
                del pattern_segs[0]

            elif len(pattern_segs) == 1 or (len(pattern_segs) == 2 and not pattern_segs[1]):
                # A single pattern without a beginning slash ('/') will match any
                # descendant path. This is equivalent to "**/{pattern}". So, prepend
                # with double-asterisks to make pattern relative to root.
                # - EDGE CASE: This also holds for a single pattern with a trailing
                #   slash (e.g. dir/).
                if pattern_segs[0] != '**':
                    pattern_segs.insert(0, '**')

            else:
                # EDGE CASE: A pattern without a beginning slash ('/') but contains at
                # least one prepended directory (e.g. "dir/{pattern}") should not match
                # "**/dir/{pattern}", according to `git check-ignore` (v2.4.1).
                pass

            if not pattern_segs:
                # After resolving the edge cases, we end up with no pattern at all. This
                # must be because the pattern is invalid.
                raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}")

            if not pattern_segs[-1] and len(pattern_segs) > 1:
                # A pattern ending with a slash ('/') will match all descendant paths if
                # it is a directory but not if it is a regular file. This is equivalent
                # to "{pattern}/**". So, set last segment to a double-asterisk to
                # include all descendants.
                pattern_segs[-1] = '**'

            if override_regex is None:
                # Build regular expression from pattern.
                output = ['^']
                # Whether a '/' separator must be emitted before the next segment.
                need_slash = False
                end = len(pattern_segs) - 1
                for i, seg in enumerate(pattern_segs):
                    if seg == '**':
                        if i == 0 and i == end:
                            # A pattern consisting solely of double-asterisks ('**') will
                            # match every path.
                            output.append('[^/]+(?:/.*)?')

                        elif i == 0:
                            # A normalized pattern beginning with double-asterisks
                            # ('**') will match any leading path segments.
                            output.append('(?:.+/)?')
                            need_slash = False

                        elif i == end:
                            # A normalized pattern ending with double-asterisks ('**') will
                            # match any trailing path segments.
                            if is_dir_pattern:
                                output.append(f'(?P<{_DIR_MARK}>/).*')
                            else:
                                output.append('/.*')

                        else:
                            # A pattern with inner double-asterisks ('**') will match multiple
                            # (or zero) inner path segments.
                            output.append('(?:/.+)?')
                            need_slash = True

                    elif seg == '*':
                        # Match single path segment.
                        if need_slash:
                            output.append('/')

                        output.append('[^/]+')

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file or a
                            # directory (with paths underneath it). E.g., "foo" matches "foo",
                            # "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                    else:
                        # Match segment glob pattern.
                        if need_slash:
                            output.append('/')

                        try:
                            output.append(cls._translate_segment_glob(seg))
                        except ValueError as e:
                            # Report the failure in terms of the whole pattern rather than
                            # the single segment that could not be translated.
                            raise GitWildMatchPatternError(f"Invalid git pattern: {original_pattern!r}") from e

                        if i == end:
                            # A pattern ending without a slash ('/') will match a file or a
                            # directory (with paths underneath it). E.g., "foo" matches "foo",
                            # "foo/bar", "foo/bar/baz", etc.
                            output.append(f'(?:(?P<{_DIR_MARK}>/).*)?')

                        need_slash = True

                output.append('$')
                regex = ''.join(output)

            else:
                # Use regex override.
                regex = override_regex

        else:
            # A blank pattern is a null-operation (neither includes nor excludes
            # files).
            regex = None
            include = None

        if regex is not None and return_type is bytes:
            # Hand back the same string type that was passed in.
            regex = regex.encode(_BYTES_ENCODING)

        return regex, include

    @staticmethod
    def _translate_segment_glob(pattern: str) -> str:
        """
        Translates the glob pattern to a regular expression. This is used by
        :meth:`pattern_to_regex` to translate a path segment glob pattern to its
        corresponding regular expression.

        *pattern* (:class:`str`) is the glob pattern.

        Returns the regular expression (:class:`str`).

        Raises :class:`ValueError` if *pattern* ends with an escape character
        (back-slash) that has no following character to escape.
        """
        # NOTE: This is derived from `fnmatch.translate()` and is similar to the
        # POSIX function `fnmatch()` with the `FNM_PATHNAME` flag set.

        escape = False
        # Collect the regex pieces and join them once at the end.
        parts = []
        i, end = 0, len(pattern)
        while i < end:
            # Get next character.
            char = pattern[i]
            i += 1

            if escape:
                # Escape the character.
                escape = False
                parts.append(re.escape(char))

            elif char == '\\':
                # Escape character, escape next character.
                escape = True

            elif char == '*':
                # Multi-character wildcard. Match any string (except slashes), including
                # an empty string.
                parts.append('[^/]*')

            elif char == '?':
                # Single-character wildcard. Match any single character (except a
                # slash).
                parts.append('[^/]')

            elif char == '[':
                # Bracket expression wildcard. Except for the beginning exclamation
                # mark, the whole bracket expression can be used directly as regex, but
                # we have to find where the expression ends.
                # - "[][!]" matches ']', '[' and '!'.
                # - "[]-]" matches ']' and '-'.
                # - "[!]a-]" matches any character except ']', 'a' and '-'.
                j = i

                # Pass bracket expression negation.
                if j < end and (pattern[j] == '!' or pattern[j] == '^'):
                    j += 1

                # Pass first closing bracket if it is at the beginning of the
                # expression.
                if j < end and pattern[j] == ']':
                    j += 1

                # Find closing bracket. Stop once we reach the end or find it.
                while j < end and pattern[j] != ']':
                    j += 1

                if j < end:
                    # Found end of bracket expression. Increment j to be one past the
                    # closing bracket:
                    #
                    #  [...]
                    #   ^   ^
                    #   i   j
                    #
                    j += 1
                    expr = '['

                    if pattern[i] in ('!', '^'):
                        # Bracket expression needs to be negated.
                        #
                        # POSIX declares that the regex bracket expression negation "[^...]"
                        # is undefined in a glob pattern. Python's `fnmatch.translate()`
                        # escapes the caret ('^') as a literal. Git supports using a
                        # caret for negation. Maintain consistency with Git because that is
                        # the expected behavior.
                        expr += '^'
                        i += 1

                    # Build regex bracket expression. Escape back-slashes so they are
                    # treated as literal back-slashes by regex as defined by POSIX.
                    expr += pattern[i:j].replace('\\', '\\\\')

                    # Add regex bracket expression to regex result.
                    parts.append(expr)

                    # Set i to one past the closing bracket.
                    i = j

                else:
                    # Failed to find closing bracket, treat opening bracket as a bracket
                    # literal instead of as an expression.
                    parts.append('\\[')

            else:
                # Regular character, escape it for regex.
                parts.append(re.escape(char))

        if escape:
            raise ValueError(f"Escape character found with no next character to escape: {pattern!r}")

        return ''.join(parts)

    @staticmethod
    def escape(s: AnyStr) -> AnyStr:
        """
        Escape special characters in the given string.

        *s* (:class:`str` or :class:`bytes`) a filename or a string that you want to
        escape, usually before adding it to a ".gitignore".

        Returns the escaped string (:class:`str` or :class:`bytes`), of the same
        type as *s*.

        Raises :class:`TypeError` if *s* is neither :class:`str` nor
        :class:`bytes`.
        """
        if isinstance(s, str):
            return_type = str
            string = s
        elif isinstance(s, bytes):
            return_type = bytes
            string = s.decode(_BYTES_ENCODING)
        else:
            raise TypeError(f"s:{s!r} is not a unicode or byte string.")

        # Reference: https://git-scm.com/docs/gitignore#_pattern_format
        # NOTE: The back-slash itself is not part of this set, so back-slashes
        # already present in the input are passed through unchanged.
        meta_characters = r"[]!*#?"

        out_string = "".join("\\" + x if x in meta_characters else x for x in string)

        if return_type is bytes:
            return out_string.encode(_BYTES_ENCODING)
        else:
            return out_string

# Register `GitWildMatchPattern` under the name "gitwildmatch".
util.register_pattern('gitwildmatch', GitWildMatchPattern)


class GitIgnorePattern(GitWildMatchPattern):
    """
    The :class:`GitIgnorePattern` class is deprecated by :class:`GitWildMatchPattern`.
    This class only exists to maintain compatibility with v0.4.
    """

    def __init__(self, *args, **kw) -> None:
        """
        Warn about deprecation, then initialize the pattern with the given
        arguments.
        """
        self._deprecated()
        super().__init__(*args, **kw)

    @staticmethod
    def _deprecated() -> None:
        """
        Warn about deprecation.
        """
        # stacklevel=3 attributes the warning to the caller of the method that
        # invoked this helper (this helper -> that method -> its caller).
        warnings.warn((
            "GitIgnorePattern ('gitignore') is deprecated. Use GitWildMatchPattern "
            "('gitwildmatch') instead."
        ), DeprecationWarning, stacklevel=3)

    @classmethod
    def pattern_to_regex(cls, *args, **kw):
        """
        Warn about deprecation, then delegate to the parent class
        implementation and return its result.
        """
        cls._deprecated()
        return super().pattern_to_regex(*args, **kw)

# Register `GitIgnorePattern` as "gitignore" for backward compatibility with
# v0.4.
util.register_pattern('gitignore', GitIgnorePattern)