mirror of https://github.com/python/cpython.git
GH-102613: Improve performance of `pathlib.Path.rglob()` (GH-104244)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new `_DoubleRecursiveWildcardSelector` class is introduced which performs de-duplication, but it is used _only_ for patterns with multiple non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding the use of a set, `PurePath.__hash__()` is not called, and so paths do not need to be stringified and case-normalised. Also merge adjacent `**` segments in patterns, so that a pattern such as `**/**/foo` is handled like `**/foo`.
This commit is contained in:
parent
8d95012c95
commit
c0ece3dc97
|
@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
|
|||
@functools.lru_cache()
|
||||
def _make_selector(pattern_parts, flavour, case_sensitive):
|
||||
pat = pattern_parts[0]
|
||||
child_parts = pattern_parts[1:]
|
||||
if not pat:
|
||||
return _TerminatingSelector()
|
||||
if pat == '**':
|
||||
cls = _RecursiveWildcardSelector
|
||||
elif pat == '..':
|
||||
cls = _ParentSelector
|
||||
elif '**' in pat:
|
||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
child_parts_idx = 1
|
||||
while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
|
||||
child_parts_idx += 1
|
||||
child_parts = pattern_parts[child_parts_idx:]
|
||||
if '**' in child_parts:
|
||||
cls = _DoubleRecursiveWildcardSelector
|
||||
else:
|
||||
cls = _RecursiveWildcardSelector
|
||||
else:
|
||||
cls = _WildcardSelector
|
||||
child_parts = pattern_parts[1:]
|
||||
if pat == '..':
|
||||
cls = _ParentSelector
|
||||
elif '**' in pat:
|
||||
raise ValueError("Invalid pattern: '**' can only be an entire path component")
|
||||
else:
|
||||
cls = _WildcardSelector
|
||||
return cls(pat, child_parts, flavour, case_sensitive)
|
||||
|
||||
|
||||
|
@ -183,20 +191,32 @@ def _iterate_directories(self, parent_path, scandir):
|
|||
|
||||
def _select_from(self, parent_path, scandir):
|
||||
try:
|
||||
yielded = set()
|
||||
try:
|
||||
successor_select = self.successor._select_from
|
||||
for starting_point in self._iterate_directories(parent_path, scandir):
|
||||
for p in successor_select(starting_point, scandir):
|
||||
if p not in yielded:
|
||||
yield p
|
||||
yielded.add(p)
|
||||
finally:
|
||||
yielded.clear()
|
||||
successor_select = self.successor._select_from
|
||||
for starting_point in self._iterate_directories(parent_path, scandir):
|
||||
for p in successor_select(starting_point, scandir):
|
||||
yield p
|
||||
except PermissionError:
|
||||
return
|
||||
|
||||
|
||||
class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
    """
    Recursive-wildcard selector that never yields the same path twice.

    It behaves like _RecursiveWildcardSelector, except that results coming
    from successive selectors are filtered for duplicates. This filtering is
    required when a pattern has several '**' segments that are not adjacent
    to each other.
    """

    def _select_from(self, parent_path, scandir):
        # Paths already handed to the caller during this traversal.
        seen = set()
        try:
            for candidate in super()._select_from(parent_path, scandir):
                if candidate in seen:
                    continue
                yield candidate
                seen.add(candidate)
        finally:
            # Runs when the generator is exhausted or closed early; empties
            # the set so it no longer references the yielded paths.
            seen.clear()
|
||||
|
||||
|
||||
#
|
||||
# Public API
|
||||
#
|
||||
|
|
|
@ -1853,13 +1853,14 @@ def _check(path, pattern, case_sensitive, expected):
|
|||
|
||||
def test_rglob_common(self):
|
||||
def _check(glob, expected):
|
||||
self.assertEqual(set(glob), { P(BASE, q) for q in expected })
|
||||
self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
|
||||
P = self.cls
|
||||
p = P(BASE)
|
||||
it = p.rglob("fileA")
|
||||
self.assertIsInstance(it, collections.abc.Iterator)
|
||||
_check(it, ["fileA"])
|
||||
_check(p.rglob("fileB"), ["dirB/fileB"])
|
||||
_check(p.rglob("**/fileB"), ["dirB/fileB"])
|
||||
_check(p.rglob("*/fileA"), [])
|
||||
if not os_helper.can_symlink():
|
||||
_check(p.rglob("*/fileB"), ["dirB/fileB"])
|
||||
|
@ -1883,9 +1884,12 @@ def _check(glob, expected):
|
|||
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
|
||||
"dirC/dirD", "dirC/dirD/fileD"])
|
||||
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
|
||||
_check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
|
||||
_check(p.rglob("dir*/**"), ["dirC/dirD"])
|
||||
_check(p.rglob("*/*"), ["dirC/dirD/fileD"])
|
||||
_check(p.rglob("*/"), ["dirC/dirD"])
|
||||
_check(p.rglob(""), ["dirC", "dirC/dirD"])
|
||||
_check(p.rglob("**"), ["dirC", "dirC/dirD"])
|
||||
# gh-91616, a re module regression
|
||||
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
|
||||
_check(p.rglob("*.*"), ["dirC/novel.txt"])
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
Improve performance of :meth:`pathlib.Path.glob` when expanding recursive
|
||||
wildcards ("``**``") by merging adjacent wildcards and de-duplicating
|
||||
results only when necessary.
|
Loading…
Reference in New Issue