Source code for CatMatcher.match_configurator

import os
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import Optional, Literal, Union


@dataclass
class MatchConfigurator:
    """
    A configuration class for managing file matching operations using STILTS
    (Starlink Tables Infrastructure Library Tool Set).

    This class prepares and validates input parameters, infers file formats, and sets up
    a working directory structure for matching catalog files based on various configuration
    options. The first 3 listed attributes need to be defined by the user, whereas all others
    are either optional or set to default values.

    Attributes:
        file_list (Union[list, str]): List of input file names. Alternatively, a single file
            as a string, but note that this can only be used when a separate reference file
            is provided.
            :no-index:
        file_path (str): Path at which the input files are located.
        match_radius (float): Matching radius for positional matching, in arcseconds.
        match_values (Union[str, list], optional): Columns used for matching, default "RA DEC".
            If the match columns are not identical across all input columns, a list with all
            the column names, in the same order as the file_list, needs to be provided.
            A plain string is wrapped into a one-element list during initialization.
        output_mode (str, fixed): Output mode for the matcher tool. Currently frozen to one
            specific mode.
        output_file_name (str, optional): Name of the output file to write matched results,
            including the desired file type. See ifmt or ofmt for supported filetypes.
            (Default: "matched.csv")
        command_file_name (str, optional): Name of the command file used to run matching,
            without suffix. (Default: "Nmatch_commands")
        cwd (str, optional): Name of the directory where matching outputs and scripts will be
            saved. It is created inside file_path. (Default: "CatMatcher_cwd")
        matcher (Literal, ["sky", "skyerr", "exact"]): Type of matching engine to use.
            (Default: "sky")
        multimode (Literal, ["pairs", "group"]): Matching mode, either "pairs" or "group".
            (Default: "group")
        join_mode (Literal, ["default", "match", "nomatch", "always"]): How results are joined
            across matched/unmatched catalogs. (Default: "match")
        runner (Literal, ["parallel", "parallel-all", "sequential", "classic", "partest"]):
            Execution mode for the STILTS matcher. (Default: "parallel")
        progress (Literal, ["none", "log", "time", "profile"]): Logging/progress output during
            matching. (Default: "time")
        fixcols (Literal, ["none", "dups", "all"]): Determines how input columns are renamed in
            the output table, according to the suffix_list parameters. If "none", no columns
            are renamed, if "dups" only columns which would otherwise have duplicate names in
            the output are renamed, if "all" every column will be renamed. (Default: "dups")
        reference_file (Optional[str], optional): Optional reference file for input format
            inference. If provided, a single string input for file_list is acceptable.
        suffix_list (Optional[list], optional): Optional list of suffixes for identifying
            files. If None, the numeric indices 1, 2, ... (in file_list order) are used.
        iref (Optional[int], optional): If multimode="pairs" this parameter gives the index of
            the table in the file_list, which serves as the reference table, i.e. must be
            matched by other tables.
        input_command (Optional[str], optional): Custom command string indicating actions to be
            performed on columns of all input tables.
        output_command (Optional[str], optional): Custom command string indicating actions to
            be performed on columns of the output table.
        ifmt (Optional[list or Literal], ["colfits", "csv", "ecsv", "fits", "tst", "votable"]):
            Input format(s) for catalog files. If not provided, they will be inferred from the
            reference_file (if given) or from the file_list. Always stored as a list after
            initialization.
        ofmt (Optional[Literal], ["colfits", "csv", "ecsv", "fits", "tst", "votable"]):
            Output format for result file. If not provided, it will be inferred from the
            output_file_name.
    """

    # ---------------------------
    # Minimum necessary user-input
    file_list: Union[list, str]
    file_path: str
    match_radius: float
    # params = < match - params >  # TODO: This can also be a list, but in a weird format

    # ----------------------------
    # Variables with default values
    # A) free choice
    match_values: Union[str, list] = "RA DEC"
    output_mode: str = "out"  # There are more options but not needed for now
    output_file_name: str = "matched.csv"
    command_file_name: str = "Nmatch_commands"
    cwd: str = "CatMatcher_cwd"

    # B) Multiple-choice
    # More matchers available: https://www.star.bris.ac.uk/mbt/stilts/sun256/MatchEngine.html
    matcher: Literal["sky", "skyerr", "exact"] = "sky"
    multimode: Literal["pairs", "group"] = "group"  # multimode = pairs | group
    join_mode: Literal["default", "match", "nomatch", "always"] = "match"
    runner: Literal["parallel", "parallel-all", "sequential", "classic", "partest"] = "parallel"
    progress: Literal["none", "log", "time", "profile"] = "time"
    fixcols: Literal["none", "dups", "all"] = "dups"
    # TODO: tuning: < tuning - params >

    # ----------------------------
    # Optional
    reference_file: Optional[str] = None
    suffix_list: Optional[list] = None
    iref: Optional[int] = None  # index into file_list (documented as an integer)
    input_command: Optional[str] = None
    output_command: Optional[str] = None
    ifmt: Optional[Union[list, Literal["colfits", "csv", "ecsv", "fits", "tst", "votable"]]] = None
    ofmt: Optional[Literal["colfits", "csv", "ecsv", "fits", "tst", "votable"]] = None

    def __post_init__(self):
        """
        Post-initialization hook to validate inputs, infer further attributes from user input,
        set up internal variables and create the necessary directory structure.

        Validation runs before anything is inferred or written to disk, so an invalid
        configuration raises a ValueError without leaving directories behind.

        Raises:
            ValueError: If the file list or suffix list contains empty strings or NaN entries,
                if the suffix list length does not match the number of input files, if a single
                string file_list is given without reference_file or ifmt, or if a file format
                cannot be inferred.
        """
        # A single file may be passed as a plain string; treat it as a one-element list for
        # counting and validation so that it is not iterated character by character.
        # self.file_list itself is left exactly as the user supplied it.
        files = [self.file_list] if isinstance(self.file_list, str) else self.file_list

        # ----------------------------
        # A) User input verification
        if self._has_blank_entries(files):
            raise ValueError("Empty strings or NAN entries encountered in input file list.")

        # ----------------------------
        # B) Infer additional attributes from input variables
        self.command_file = f"{self.command_file_name}.txt"  # ending MUST be .txt
        self.n_in = len(files)

        # infer suffix list if not provided
        if not self.suffix_list:
            self.suffix_list = [f"{i}" for i in range(1, self.n_in + 1)]
        elif len(self.suffix_list) != self.n_in:
            raise ValueError("Length of suffix-list does not match number of input files.")

        # check if suffix list is valid
        if self.suffix_list and self._has_blank_entries(self.suffix_list):
            raise ValueError("Empty strings or NAN entries encountered in user-provided suffix list.")

        # Infer input format: the reference file takes precedence over the individual inputs
        if not self.ifmt:
            if self.reference_file:
                self.ifmt = self._infer_fmt(self.reference_file)
            elif isinstance(self.file_list, str):
                raise ValueError(
                    "A single string as file_list requires a reference_file (or an explicit ifmt)."
                )
            else:
                self.ifmt = [self._infer_fmt(name) for name in files]

        # Infer output format
        if not self.ofmt and self.output_file_name:
            self.ofmt = self._infer_fmt(self.output_file_name)

        # ----------------------------
        # C) Convert inputs
        # Lists are needed for command printing of StiltsMatcher.build_N_match()
        if isinstance(self.ifmt, str):
            self.ifmt = [self.ifmt]
        if isinstance(self.match_values, str):
            self.match_values = [self.match_values]

        self.normalized_path = Path(self.file_path).as_posix()  # normalize path (to work across systems)

        # ----------------------------
        # D) Setup directory hierarchy for matching
        self._setup_stilts_directories()

    @staticmethod
    def _has_blank_entries(entries):
        """
        Check whether a sequence contains empty strings or float NaN values.

        Args:
            entries (iterable): The entries to inspect.

        Returns:
            bool: True if at least one entry is "" or a float NaN, False otherwise.
        """
        return any(
            entry == "" or (isinstance(entry, float) and np.isnan(entry))
            for entry in entries
        )

    @staticmethod
    def _infer_fmt(filename):
        """
        Infer the file format from the file extension.

        The extension is everything after the last dot of the final path component, so
        dots in directory names or in the file stem (e.g. "my.catalog.v2.csv") do not
        interfere. The extension is compared case-insensitively.

        Args:
            filename (str): The name of the file from which to infer the format.

        Returns:
            str: Inferred file format (lower case).

        Raises:
            ValueError: If file has no extension or format is unsupported.
        """
        supported_formats = ["colfits", "csv", "ecsv", "fits", "tst", "votable"]

        # Strip any directory part first; both "/" and "\\" are treated as separators
        basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
        _, dot, extension = basename.rpartition(".")
        if not dot:
            raise ValueError("No extension found")

        fmt = extension.lower()
        if fmt not in supported_formats:
            raise ValueError(f"Unsupported file format '{fmt}'. Allowed formats are: {sorted(supported_formats)}")
        return fmt

    def _generate_match_values_from_suffix(self):
        """
        Generate a list of match columns based on the input string and the supplied suffix list.

        Useful when dealing with more than two input files and a shared base column name.
        For every suffix, each whitespace-separated column name in match_values is extended
        to "<column>_<suffix>".

        Returns:
            None: No direct return, but results are written to self.match_value_list.
        """
        # NOTE(review): __post_init__ wraps a string match_values into a list, so on a normally
        # constructed instance this condition is False and match_value_list is never set.
        # Confirm the intended behaviour with the caller before relying on this method.
        if self.n_in > 2 and isinstance(self.match_values, str):
            # split() without argument handles any amount of whitespace, no number limit
            match_columns = self.match_values.split()
            self.match_value_list = [
                " ".join(f"{val}_{suffix}" for val in match_columns)
                for suffix in self.suffix_list
            ]

    def _setup_stilts_directories(self):
        """
        Create and initialize the working directory structure used by the StiltsMatcher class.

        Directories include the main working path, a scripts directory, and a matches
        directory. Already existing directories are left untouched. The scripts directory
        is additionally stored in self._script_path.
        """
        # define path to working directory (cwd)
        if self.normalized_path.endswith("/"):
            # no need to add the slash if provided in the path name
            cwd_path = self.normalized_path + self.cwd
        else:
            # if pathname does not include slash, add it
            cwd_path = self.normalized_path + "/" + self.cwd

        # Create various subdirectories
        script_dir = cwd_path + "/scripts/"
        match_dir = cwd_path + "/matches/"

        # assign script_path variable because it is needed later to build the N match
        self._script_path = script_dir

        # create directories (parent first, so the subdirectories can be created inside it)
        for directory in (cwd_path, script_dir, match_dir):
            try:
                os.mkdir(directory)
            except FileExistsError:
                pass