# Source code for tcutility.pathfunc

import os
import re
from typing import Dict, List
import glob

from tcutility import results

# short module-level alias for os.path.join, used when assembling paths below
j = os.path.join


def split_all(path: str) -> List[str]:
    """
    Break a path up into its individual components.

    Args:
        path: the path to take apart. It is normalised with :func:`os.path.normpath`
            first and then peeled apart from the right using :func:`os.path.split`.

    Returns:
        The components of the normalised path, ordered from left to right.

    Example:
        .. code-block:: python

            >>> split_all('a/b/c/d')
            ['a', 'b', 'c', 'd']
    """
    remaining = os.path.normpath(path)
    reversed_parts = []

    head, tail = os.path.split(remaining)
    # keep peeling off the right-most component for as long as both a head and a tail are left
    while head and tail:
        reversed_parts.append(tail)
        remaining = head
        head, tail = os.path.split(remaining)

    # whatever could not be split any further (a single name or the filesystem root) is the first part
    reversed_parts.append(remaining)
    reversed_parts.reverse()
    return reversed_parts
def get_subdirectories(root: str, include_intermediates: bool = False, max_depth: int = None, _current_depth: int = 0) -> List[str]:
    """
    Get all sub-directories of a root directory.

    Args:
        root: the root directory.
        include_intermediates: whether to include intermediate sub-directories instead of only the lowest levels.
            When enabled, ``root`` itself is also included in the output.
        max_depth: the maximum depth to descend to while looking for subdirectories. Depth is counted from zero,
            so setting it to ``0`` will return only the direct sub-directories of ``root``, ``1`` descends one
            level further, and so on. If not given, there is no limit.
        _current_depth: internal recursion counter, callers should leave this at its default.

    Returns:
        A list of sub-directories with ``root`` included in the paths.
        The order follows :func:`os.scandir` and is therefore not guaranteed to be sorted.

    Example:
        Given a file-structure as follows:

        .. code-block::

            root
            |- subdir_a
            |  |- subsubdir_b
            |  |- subsubdir_c
            |- subdir_b
            |- subdir_c

        Then we get the following outputs.

        .. tabs::

            .. group-tab:: Including intermediates

                .. code-block:: python

                    >>> get_subdirectories('root', include_intermediates=True)
                    ['root',
                     'root/subdir_a',
                     'root/subdir_a/subsubdir_b',
                     'root/subdir_a/subsubdir_c',
                     'root/subdir_b',
                     'root/subdir_c']

            .. group-tab:: Excluding intermediates

                .. code-block:: python

                    >>> get_subdirectories('root', include_intermediates=False)
                    ['root/subdir_a/subsubdir_b',
                     'root/subdir_a/subsubdir_c',
                     'root/subdir_b',
                     'root/subdir_c']
    """
    contents = []
    # the root itself only counts as an "intermediate" on the outermost call
    if _current_depth == 0 and include_intermediates:
        contents.append(root)

    with os.scandir(root) as scanner:
        for entry in scanner:
            # only descend into real directories; testing for "not a file" would also let through
            # broken symlinks, FIFOs and sockets, on which os.scandir raises
            if not entry.is_dir():
                continue

            # at the depth limit we record the directory but do not look inside it
            if _current_depth == max_depth:
                contents.append(entry.path)
                continue

            sub_contents = get_subdirectories(
                entry.path,
                include_intermediates=include_intermediates,
                max_depth=max_depth,
                _current_depth=_current_depth + 1,
            )
            # a directory is reported if it is a leaf, or if intermediates were asked for
            if include_intermediates or not sub_contents:
                contents.append(entry.path)
            contents.extend(sub_contents)

    return contents
def path_depth(path: str) -> int:
    """
    Count how many components a path is made up of.

    Args:
        path: the path to measure. It is taken apart with :func:`split_all`.

    Returns:
        The number of parts that :func:`split_all` produces for ``path``.
    """
    components = split_all(path)
    return len(components)
def match(root: str, pattern: str, sort_by: str = None) -> Dict[str, dict]:
    """
    Find and return information about subdirectories of a root that match a given pattern.

    Args:
        root: the root of the subdirectories to look in.
        pattern: a string specifying the pattern the subdirectories should correspond to.
            It should look similar to a format string, without the ``f`` in front of the string.
            Inside curly braces you can put a variable name, which you can later extract from the results.
            Anything inside curly braces will be matched to word characters (``[a-zA-Z0-9_-]``) including dashes and underscores.
            Text outside of curly braces has to be present verbatim in the path.
        sort_by: the key to sort the results by. If not given, the results will be returned in the order they were found.

    Returns:
        A |Result| object containing the matched directories as keys and information (also |Result| object) about those matches as the values.
        Each information dictionary contains the variables given in the pattern.
        E.g. using a pattern such as ``{a}/{b}/{c}`` will populate the ``info.a``, ``info.b`` and ``info.c`` keys of the info |Result| object.
        Paths whose variable parts contain characters other than ``[a-zA-Z0-9_-]`` are left out of the results.

    Example:
        Given a file-structure as follows:

        .. code-block::

            root
            |- NH3-BH3
            |  |- BLYP_QZ4P
            |  |  |- extra_dir
            |  |  |- blablabla
            |  |
            |  |- BLYP_TZ2P
            |  |  |- another_dir
            |  |
            |  |- M06-2X_TZ2P
            |
            |- SN2
            |  |- BLYP_TZ2P
            |  |- M06-2X_TZ2P
            |  |  |- M06-2X_TZ2P

        We can run the following scripts to match the subdirectories.

        .. code-block:: python

            from tcutility import log

            # get the matches, we want to extract the system name (NH3-BH3 or SN2)
            # and the functional and basis-set
            # we don't want the subdirectories
            matches = match('root', '{system}/{functional}_{basis_set}')

            # print the matches as a table
            rows = []
            for d, info in matches.items():
                rows.append([d, info.system, info.functional, info.basis_set])

            log.table(rows, ['Directory', 'System', 'Functional', 'Basis-Set'])

        which prints

        .. code-block::

            [2024/01/17 14:39:08] Directory                  System    Functional   Basis-Set
            [2024/01/17 14:39:08] ───────────────────────────────────────────────────────────
            [2024/01/17 14:39:08] root/SN2/M06-2X_TZ2P       SN2       M06-2X       TZ2P
            [2024/01/17 14:39:08] root/NH3-BH3/BLYP_TZ2P     NH3-BH3   BLYP         TZ2P
            [2024/01/17 14:39:08] root/NH3-BH3/M06-2X_TZ2P   NH3-BH3   M06-2X       TZ2P
            [2024/01/17 14:39:08] root/SN2/BLYP_TZ2P         SN2       BLYP         TZ2P
            [2024/01/17 14:39:08] root/NH3-BH3/BLYP_QZ4P     NH3-BH3   BLYP         QZ4P
    """
    # splitting on the substitutions yields alternating literal text and variable names:
    # [literal, name, literal, name, ..., literal]
    tokens = re.split(r"{(\w+)}", pattern)
    literals = tokens[0::2]
    substitutions = tokens[1::2]

    # every substitution becomes a wildcard for glob and a capture group for the regex
    # the literal parts are escaped for the regex so that characters such as + or . are matched verbatim
    glob_pattern = "*".join(literals)
    regex = re.compile("([a-zA-Z0-9_-]+)".join(re.escape(text) for text in literals))

    # root with exactly one trailing separator (or nothing for an empty root)
    # this is the prefix glob puts in front of every hit
    prefix = os.path.join(root, "")

    # go through all applicable subdirectories and retrieve the information we want
    ret = results.Result()
    for found in glob.glob(os.path.join(root, glob_pattern)):
        if found.startswith(prefix):
            subdir = found[len(prefix):]
        else:
            # glob may have normalised the root part, fall back to a computed relative path
            subdir = os.path.relpath(found, root)

        # the pattern is written with forward slashes, so compare against a path that uses them as well
        re_match = regex.fullmatch(subdir.replace(os.sep, "/"))
        # the glob wildcard is more permissive than the allowed characters, skip anything it let through
        if re_match is None:
            continue

        ret[os.path.join(root, subdir)] = results.Result(**dict(zip(substitutions, re_match.groups())))

    if not sort_by:
        return ret

    # if requested we sort the results before returning them
    return results.Result(sorted(ret.items(), key=lambda d: d[1][sort_by]))