From 1d2832b218de725b682636503943457e3cd6c602 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 11:57:14 -0600 Subject: [PATCH 01/15] docs: Harmonize docstrings to NumPy style with Python 3.10+ type hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Convert all docstrings to NumPy style (Parameters, Returns, Raises, Notes, Examples) - Add type hints using Python 3.10+ syntax (X | None, list[str]) - Add `from __future__ import annotations` for deferred evaluation - Add TYPE_CHECKING guards for circular import prevention - Create types.py with TypeAlias definitions (PrimaryKey, Row, etc.) - Modernize connection.py error handling with match/case pattern Files updated: - table.py, expression.py, fetch.py, autopopulate.py - schemas.py, connection.py, heading.py - condition.py, jobs.py - types.py (new) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/autopopulate.py | 349 +++++++++++------- src/datajoint/condition.py | 205 ++++++++--- src/datajoint/connection.py | 355 ++++++++++++------ src/datajoint/expression.py | 666 ++++++++++++++++++++++------------ src/datajoint/fetch.py | 82 +++-- src/datajoint/heading.py | 211 ++++++++--- src/datajoint/jobs.py | 253 +++++++++---- src/datajoint/schemas.py | 416 +++++++++++++++------ src/datajoint/table.py | 611 +++++++++++++++++++++---------- src/datajoint/types.py | 60 +++ 10 files changed, 2246 insertions(+), 962 deletions(-) create mode 100644 src/datajoint/types.py diff --git a/src/datajoint/autopopulate.py b/src/datajoint/autopopulate.py index d92a1edf6..6c4539760 100644 --- a/src/datajoint/autopopulate.py +++ b/src/datajoint/autopopulate.py @@ -1,5 +1,7 @@ """This module defines class dj.AutoPopulate""" +from __future__ import annotations + import contextlib import datetime import inspect @@ -7,6 +9,7 @@ import multiprocessing as mp import signal import traceback +from typing import TYPE_CHECKING, Any, Generator import deepdiff from tqdm import tqdm @@ -14,6 +17,10 @@ from .errors import DataJointError, LostConnectionError from .expression import AndList, QueryExpression +if TYPE_CHECKING: + from .jobs import Job + from .table import Table + # noinspection PyExceptionInherit,PyCallingNonCallable logger = logging.getLogger(__name__.split(".")[0]) @@ -22,10 +29,20 @@ # --- helper functions for multiprocessing -- -def _initialize_populate(table, jobs, populate_kwargs): +def _initialize_populate(table: Table, jobs: Job | None, populate_kwargs: dict[str, Any]) -> None: """ - Initialize the process for multiprocessing. - Saves the unpickled copy of the table to the current process and reconnects. + Initialize a worker process for multiprocessing. + + Saves the unpickled table to the current process and reconnects to database. + + Parameters + ---------- + table : Table + Table instance to populate. + jobs : Job or None + Job management object or None for direct mode. + populate_kwargs : dict + Arguments for _populate1(). """ process = mp.current_process() process.table = table @@ -34,11 +51,19 @@ def _initialize_populate(table, jobs, populate_kwargs): table.connection.connect() # reconnect -def _call_populate1(key): +def _call_populate1(key: dict[str, Any]) -> bool | tuple[dict[str, Any], Any]: """ - Call current process' table._populate1() - :key - a dict specifying job to compute - :return: key, error if error, otherwise None + Call _populate1() for a single key in the worker process. 
+ + Parameters + ---------- + key : dict + Primary key specifying job to compute. + + Returns + ------- + bool or tuple + Result from _populate1(). """ process = mp.current_process() return process.table._populate1(key, process.jobs, **process.populate_kwargs) @@ -46,9 +71,22 @@ def _call_populate1(key): class AutoPopulate: """ - AutoPopulate is a mixin class that adds the method populate() to a Table class. - Auto-populated tables must inherit from both Table and AutoPopulate, - must define the property `key_source`, and must define the callback method `make`. + Mixin class that adds automated population to Table classes. + + Auto-populated tables (Computed, Imported) inherit from both Table and + AutoPopulate. They must implement the ``make()`` method that computes + and inserts data for one primary key. + + Attributes + ---------- + key_source : QueryExpression + Query yielding keys to be populated. Default is join of FK parents. + jobs : Job + Job table (``~~table_name``) for distributed processing. + + Notes + ----- + Subclasses may override ``key_source`` to customize population scope. """ _key_source = None @@ -56,15 +94,18 @@ class AutoPopulate: _jobs = None @property - def jobs(self): + def jobs(self) -> Job: """ Access the job table for this auto-populated table. - The job table (~~table_name) is created lazily on first access. + The job table (``~~table_name``) is created lazily on first access. It tracks job status, priority, scheduling, and error information for distributed populate operations. - :return: Job object for this table + Returns + ------- + Job + Job management object for this table. """ if self._jobs is None: from .jobs import Job @@ -74,7 +115,7 @@ def jobs(self): self._jobs.declare() return self._jobs - def _declare_check(self, primary_key, fk_attribute_map): + def _declare_check(self, primary_key: list[str], fk_attribute_map: dict[str, tuple[str, str]]) -> None: """ Validate FK-only primary key constraint for auto-populated tables. @@ -82,12 +123,18 @@ def _declare_check(self, primary_key, fk_attribute_map): attributes from foreign key references. This ensures proper job granularity for distributed populate operations. - This validation can be bypassed by setting: - dj.config.jobs.allow_new_pk_fields_in_computed_tables = True - - :param primary_key: list of primary key attribute names - :param fk_attribute_map: dict mapping child_attr -> (parent_table, parent_attr) - :raises DataJointError: if native PK attributes are found (unless bypassed) + Parameters + ---------- + primary_key : list + List of primary key attribute names. + fk_attribute_map : dict + Mapping of child_attr -> (parent_table, parent_attr). + + Raises + ------ + DataJointError + If native (non-FK) PK attributes are found, unless bypassed via + ``dj.config.jobs.allow_new_pk_fields_in_computed_tables = True``. """ from .settings import config @@ -110,13 +157,22 @@ def _declare_check(self, primary_key, fk_attribute_map): ) @property - def key_source(self): + def key_source(self) -> QueryExpression: """ - :return: the query expression that yields primary key values to be passed, - sequentially, to the ``make`` method when populate() is called. - The default value is the join of the parent tables references from the primary key. - Subclasses may override they key_source to change the scope or the granularity - of the make calls. + Query expression yielding keys to be populated. + + Returns the primary key values to be passed sequentially to ``make()`` + when ``populate()`` is called. 
The default is the join of parent tables + referenced from the primary key. + + Returns + ------- + QueryExpression + Expression yielding keys for population. + + Notes + ----- + Subclasses may override to change the scope or granularity of make calls. """ def _rename_attributes(table, props): @@ -135,50 +191,40 @@ def _rename_attributes(table, props): self._key_source *= _rename_attributes(*q) return self._key_source - def make(self, key): + def make(self, key: dict[str, Any]) -> None | Generator[Any, Any, None]: """ - This method must be implemented by derived classes to perform automated computation. - The method must implement the following three steps: - - 1. Fetch data from tables above in the dependency hierarchy, restricted by the given key. - 2. Compute secondary attributes based on the fetched data. - 3. Insert the new tuple(s) into the current table. - - The method can be implemented either as: - (a) Regular method: All three steps are performed in a single database transaction. - The method must return None. - (b) Generator method: - The make method is split into three functions: - - `make_fetch`: Fetches data from the parent tables. - - `make_compute`: Computes secondary attributes based on the fetched data. - - `make_insert`: Inserts the computed data into the current table. - - Then populate logic is executes as follows: - - - fetched_data1 = self.make_fetch(key) - computed_result = self.make_compute(key, *fetched_data1) - begin transaction: - fetched_data2 = self.make_fetch(key) - if fetched_data1 != fetched_data2: - cancel transaction - else: - self.make_insert(key, *computed_result) - commit_transaction - + Compute and insert data for one key. - Importantly, the output of make_fetch is a tuple that serves as the input into `make_compute`. - The output of `make_compute` is a tuple that serves as the input into `make_insert`. + Must be implemented by subclasses to perform automated computation. + The method implements three steps: - The functionality must be strictly divided between these three methods: - - All database queries must be completed in `make_fetch`. - - All computation must be completed in `make_compute`. - - All database inserts must be completed in `make_insert`. + 1. Fetch data from parent tables, restricted by the given key + 2. Compute secondary attributes based on the fetched data + 3. Insert the new row(s) into the current table - DataJoint may programmatically enforce this separation in the future. + Parameters + ---------- + key : dict + Primary key value identifying the entity to compute. - :param key: The primary key value used to restrict the data fetching. - :raises NotImplementedError: If the derived class does not implement the required methods. + Raises + ------ + NotImplementedError + If neither ``make()`` nor the tripartite methods are implemented. + + Notes + ----- + **Simple make**: Implement as a regular method that performs all three + steps in a single database transaction. Must return None. + + **Tripartite make**: For long-running computations, implement: + + - ``make_fetch(key)``: Fetch data from parent tables + - ``make_compute(key, *fetched_data)``: Compute results + - ``make_insert(key, *computed_result)``: Insert results + + The tripartite pattern allows computation outside the transaction, + with referential integrity checking before commit. 
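        Examples
        --------
        A minimal sketch of the tripartite pattern; ``Image``, ``FilteredImage``,
        and ``my_filter`` are hypothetical names used only for illustration:

        >>> class FilteredImage(dj.Computed):
        ...     definition = '''
        ...     -> Image
        ...     ---
        ...     filtered : longblob
        ...     '''
        ...     def make_fetch(self, key):
        ...         return ((Image & key).fetch1("image"),)      # queries only
        ...     def make_compute(self, key, image):
        ...         return (my_filter(image),)                   # computation only
        ...     def make_insert(self, key, filtered):
        ...         self.insert1(dict(key, filtered=filtered))   # insert only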
""" if not (hasattr(self, "make_fetch") and hasattr(self, "make_insert") and hasattr(self, "make_compute")): @@ -204,9 +250,19 @@ def make(self, key): self.make_insert(key, *computed_result) yield - def _jobs_to_do(self, restrictions): + def _jobs_to_do(self, restrictions: tuple) -> QueryExpression: """ - :return: the query yielding the keys to be computed (derived from self.key_source) + Return the query yielding keys to be computed. + + Parameters + ---------- + restrictions : tuple + Conditions to filter key_source. + + Returns + ------- + QueryExpression + Keys derived from key_source that need computation. """ if self.restriction: raise DataJointError( @@ -234,44 +290,61 @@ def _jobs_to_do(self, restrictions): def populate( self, - *restrictions, - suppress_errors=False, - return_exception_objects=False, - reserve_jobs=False, - max_calls=None, - display_progress=False, - processes=1, - make_kwargs=None, - priority=None, - refresh=None, - ): + *restrictions: Any, + suppress_errors: bool = False, + return_exception_objects: bool = False, + reserve_jobs: bool = False, + max_calls: int | None = None, + display_progress: bool = False, + processes: int = 1, + make_kwargs: dict[str, Any] | None = None, + priority: int | None = None, + refresh: bool | None = None, + ) -> dict[str, Any]: """ - ``table.populate()`` calls ``table.make(key)`` for every primary key in - ``self.key_source`` for which there is not already a tuple in table. - - Two execution modes: - - **Direct mode** (reserve_jobs=False, default): - Keys computed directly from: (key_source & restrictions) - target - No job table involvement. Suitable for single-worker scenarios, - development, and debugging. - - **Distributed mode** (reserve_jobs=True): - Uses the job table (~~table_name) for multi-worker coordination. - Supports priority, scheduling, and status tracking. - - :param restrictions: conditions to filter key_source - :param suppress_errors: if True, collect errors instead of raising - :param return_exception_objects: return error objects instead of just error messages - :param reserve_jobs: if True, use job table for distributed processing - :param max_calls: maximum number of make() calls (total across all processes) - :param display_progress: if True, show progress bar - :param processes: number of worker processes - :param make_kwargs: keyword arguments passed to each make() call - :param priority: (reserve_jobs only) only process jobs at this priority or more urgent - :param refresh: (reserve_jobs only) refresh job queue before processing. - Default from config.jobs.auto_refresh - :return: dict with "success_count" and "error_list" + Populate the table by calling ``make()`` for unpopulated keys. + + Calls ``make(key)`` for every primary key in ``key_source`` for which + there is not already a row in this table. + + Parameters + ---------- + *restrictions + Conditions to filter key_source. + suppress_errors : bool, optional + If True, collect errors instead of raising. Default False. + return_exception_objects : bool, optional + If True, return exception objects instead of messages. Default False. + reserve_jobs : bool, optional + If True, use job table for distributed processing. Default False. + max_calls : int, optional + Maximum number of ``make()`` calls. + display_progress : bool, optional + If True, show progress bar. Default False. + processes : int, optional + Number of worker processes. Default 1. + make_kwargs : dict, optional + Keyword arguments passed to each ``make()`` call. 
+ priority : int, optional + (Distributed mode) Only process jobs at this priority or higher. + refresh : bool, optional + (Distributed mode) Refresh job queue before processing. + Default from ``config.jobs.auto_refresh``. + + Returns + ------- + dict + ``{"success_count": int, "error_list": list}``. + + Notes + ----- + **Direct mode** (``reserve_jobs=False``): Keys computed from + ``(key_source & restrictions) - target``. No job table. Suitable for + single-worker, development, and debugging. + + **Distributed mode** (``reserve_jobs=True``): Uses job table + (``~~table_name``) for multi-worker coordination with priority and + status tracking. """ if self.connection.in_transaction: raise DataJointError("Populate cannot be called during a transaction.") @@ -457,16 +530,35 @@ def handler(signum, frame): finally: signal.signal(signal.SIGTERM, old_handler) - def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_kwargs=None): + def _populate1( + self, + key: dict[str, Any], + jobs: Job | None, + suppress_errors: bool, + return_exception_objects: bool, + make_kwargs: dict[str, Any] | None = None, + ) -> bool | tuple[dict[str, Any], Any]: """ - Populate table for one source key, calling self.make inside a transaction. - - :param key: dict specifying job to populate - :param jobs: the Job object or None if not reserve_jobs - :param suppress_errors: if True, errors are suppressed and returned - :param return_exception_objects: if True, errors returned as objects - :return: (key, error) when suppress_errors=True, - True if successfully invoke one make() call, otherwise False + Populate table for one key, calling make() inside a transaction. + + Parameters + ---------- + key : dict + Primary key specifying the job to populate. + jobs : Job or None + Job object for distributed mode, None for direct mode. + suppress_errors : bool + If True, errors are suppressed and returned. + return_exception_objects : bool + If True, return exception objects instead of messages. + make_kwargs : dict, optional + Keyword arguments passed to ``make()``. + + Returns + ------- + bool or tuple + True if make() succeeded, False if skipped (already done or reserved), + (key, error) tuple if suppress_errors=True and error occurred. """ import time @@ -552,16 +644,24 @@ def _populate1(self, key, jobs, suppress_errors, return_exception_objects, make_ finally: self.__class__._allow_insert = False - def progress(self, *restrictions, display=False): + def progress(self, *restrictions: Any, display: bool = False) -> tuple[int, int]: """ Report the progress of populating the table. Uses a single aggregation query to efficiently compute both total and remaining counts. - :param restrictions: conditions to restrict key_source - :param display: if True, log the progress - :return: (remaining, total) -- numbers of tuples to be populated + Parameters + ---------- + *restrictions + Conditions to restrict key_source. + display : bool, optional + If True, log the progress. Default False. + + Returns + ------- + tuple + (remaining, total) - number of keys yet to populate and total keys. """ todo = self._jobs_to_do(restrictions) @@ -633,11 +733,16 @@ def _update_job_metadata(self, key, start_time, duration, version): """ Update hidden job metadata for the given key. 
- Args: - key: Primary key dict identifying the row(s) to update - start_time: datetime when computation started - duration: float seconds elapsed - version: str code version (truncated to 64 chars) + Parameters + ---------- + key : dict + Primary key identifying the row(s) to update. + start_time : datetime + When computation started. + duration : float + Computation duration in seconds. + version : str + Code version (truncated to 64 chars). """ from .condition import make_condition diff --git a/src/datajoint/condition.py b/src/datajoint/condition.py index 24c898112..750208e5c 100644 --- a/src/datajoint/condition.py +++ b/src/datajoint/condition.py @@ -1,4 +1,11 @@ -"""methods for generating SQL WHERE clauses from datajoint restriction conditions""" +""" +SQL WHERE clause generation from DataJoint restriction conditions. + +This module provides utilities for converting various restriction formats +(dicts, strings, QueryExpressions) into SQL WHERE clauses. +""" + +from __future__ import annotations import collections import datetime @@ -9,18 +16,36 @@ import re import uuid from dataclasses import dataclass +from typing import TYPE_CHECKING, Any import numpy import pandas from .errors import DataJointError +if TYPE_CHECKING: + from .expression import QueryExpression + logger = logging.getLogger(__name__.split(".")[0]) JSON_PATTERN = re.compile(r"^(?P\w+)(\.(?P[\w.*\[\]]+))?(:(?P[\w(,\s)]+))?$") -def translate_attribute(key): +def translate_attribute(key: str) -> tuple[dict | None, str]: + """ + Translate an attribute key, handling JSON path notation. + + Parameters + ---------- + key : str + Attribute name, optionally with JSON path (e.g., ``"attr.path.field"``). + + Returns + ------- + tuple + (match_dict, sql_expression) where match_dict contains parsed + components or None if no JSON path. + """ match = JSON_PATTERN.match(key) if match is None: return match, key @@ -35,26 +60,35 @@ def translate_attribute(key): class PromiscuousOperand: """ - A container for an operand to ignore join compatibility + Wrapper to bypass join compatibility checking. + + Used when you want to force a natural join without semantic matching. + + Parameters + ---------- + operand : QueryExpression + The operand to wrap. """ - def __init__(self, operand): + def __init__(self, operand: QueryExpression) -> None: self.operand = operand class AndList(list): """ - A list of conditions to by applied to a query expression by logical conjunction: the - conditions are AND-ed. All other collections (lists, sets, other entity sets, etc) are - applied by logical disjunction (OR). - - Example: - expr2 = expr & dj.AndList((cond1, cond2, cond3)) - is equivalent to - expr2 = expr & cond1 & cond2 & cond3 + List of conditions combined with logical AND. + + All conditions in the list are AND-ed together. Other collections + (lists, sets, QueryExpressions) are OR-ed. + + Examples + -------- + >>> expr & dj.AndList((cond1, cond2, cond3)) + # equivalent to + >>> expr & cond1 & cond2 & cond3 """ - def append(self, restriction): + def append(self, restriction: Any) -> None: if isinstance(restriction, AndList): # extend to reduce nesting self.extend(restriction) @@ -65,15 +99,25 @@ def append(self, restriction): @dataclass class Top: """ - A restriction to the top entities of a query. - In SQL, this corresponds to ORDER BY ... LIMIT ... OFFSET + Restrict query to top N entities with ordering. + + In SQL, corresponds to ``ORDER BY ... LIMIT ... OFFSET``. 
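    For example (``Session`` and ``session_date`` are hypothetical names), a
    ``Top`` restriction is applied with the ``&`` operator to select the five
    most recent sessions:

    >>> Session & Top(limit=5, order_by="session_date DESC")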
+ + Parameters + ---------- + limit : int, optional + Maximum number of rows to return. Default 1. + order_by : str or list[str], optional + Attributes to order by. ``"KEY"`` for primary key. Default ``"KEY"``. + offset : int, optional + Number of rows to skip. Default 0. """ limit: int | None = 1 order_by: str | list[str] = "KEY" offset: int = 0 - def __post_init__(self): + def __post_init__(self) -> None: self.order_by = self.order_by or ["KEY"] self.offset = self.offset or 0 @@ -92,30 +136,54 @@ def __post_init__(self): class Not: - """invert restriction""" - - def __init__(self, restriction): - self.restriction = restriction + """ + Invert a restriction condition. + Parameters + ---------- + restriction : any + Restriction condition to negate. -def assert_join_compatibility(expr1, expr2, semantic_check=True): + Examples + -------- + >>> table - condition # equivalent to table & Not(condition) """ - Determine if expressions expr1 and expr2 are join-compatible. - With semantic_check=True (default): - Raises an error if there are non-homologous namesakes (same name, different lineage). - This prevents accidental joins on attributes that share names but represent - different entities. + def __init__(self, restriction: Any) -> None: + self.restriction = restriction - If the ~lineage table doesn't exist for either schema, a warning is issued - and semantic checking is disabled (join proceeds as natural join). - With semantic_check=False: - No lineage checking. All namesake attributes are matched (natural join behavior). +def assert_join_compatibility( + expr1: QueryExpression, + expr2: QueryExpression, + semantic_check: bool = True, +) -> None: + """ + Check if two expressions are join-compatible. + + Parameters + ---------- + expr1 : QueryExpression + First expression. + expr2 : QueryExpression + Second expression. + semantic_check : bool, optional + If True (default), use semantic matching and error on non-homologous + namesakes (same name, different lineage). If False, use natural join. + + Raises + ------ + DataJointError + If semantic_check is True and expressions have non-homologous namesakes. + + Notes + ----- + With semantic_check=True: + Prevents accidental joins on attributes that share names but represent + different entities. If ~lineage table doesn't exist, a warning is issued. - :param expr1: A QueryExpression object - :param expr2: A QueryExpression object - :param semantic_check: If True (default), use semantic matching and error on conflicts + With semantic_check=False: + All namesake attributes are matched (natural join behavior). """ from .expression import QueryExpression, U @@ -151,16 +219,44 @@ def assert_join_compatibility(expr1, expr2, semantic_check=True): ) -def make_condition(query_expression, condition, columns, semantic_check=True): +def make_condition( + query_expression: QueryExpression, + condition: Any, + columns: set[str], + semantic_check: bool = True, +) -> str | bool: """ - Translate the input condition into the equivalent SQL condition (a string) - - :param query_expression: a dj.QueryExpression object to apply condition - :param condition: any valid restriction object. - :param columns: a set passed by reference to collect all column names used in the - condition. - :param semantic_check: If True (default), use semantic matching and error on conflicts. - :return: an SQL condition string or a boolean value. + Translate a restriction into an SQL WHERE clause condition. 
+ + Parameters + ---------- + query_expression : QueryExpression + The expression to apply the condition to. + condition : any + Valid restriction: str, dict, bool, QueryExpression, AndList, + numpy.void, pandas.DataFrame, or iterable of restrictions. + columns : set[str] + Set passed by reference to collect column names used in the condition. + semantic_check : bool, optional + If True (default), use semantic matching and error on conflicts. + + Returns + ------- + str or bool + SQL condition string, or bool if condition evaluates to constant. + + Notes + ----- + Restriction types are processed as follows: + + - ``str``: Used directly as SQL condition + - ``dict``: AND of equality conditions for matching attributes + - ``bool``: Returns the boolean value (possibly negated) + - ``QueryExpression``: Generates subquery (semijoin/antijoin) + - ``AndList``: AND of all conditions + - ``list/set/tuple``: OR of all conditions + - ``numpy.void``: Like dict, from record array + - ``pandas.DataFrame``: Converted to records, then OR-ed """ from .expression import Aggregation, QueryExpression, U @@ -296,14 +392,27 @@ def combine_conditions(negate, conditions): return f"{'NOT ' if negate else ''} ({' OR '.join(or_list)})" if or_list else negate -def extract_column_names(sql_expression): +def extract_column_names(sql_expression: str) -> set[str]: """ - extract all presumed column names from an sql expression such as the WHERE clause, - for example. + Extract column names from an SQL expression. + + Parameters + ---------- + sql_expression : str + SQL expression (e.g., WHERE clause) to parse. + + Returns + ------- + set[str] + Set of extracted column names. + + Notes + ----- + Parsing is MySQL-specific. Identifies columns by: - :param sql_expression: a string containing an SQL expression - :return: set of extracted column names - This may be MySQL-specific for now. + 1. Names in backticks (``\`column\```) + 2. Bare identifiers not followed by ``(`` (excludes functions) + 3. Excludes SQL reserved words (IS, IN, AND, OR, etc.) """ assert isinstance(sql_expression, str) result = set() diff --git a/src/datajoint/connection.py b/src/datajoint/connection.py index 66d926694..be5e6183b 100644 --- a/src/datajoint/connection.py +++ b/src/datajoint/connection.py @@ -3,12 +3,15 @@ the ``conn`` function that provides access to a persistent connection in datajoint. """ +from __future__ import annotations + import logging import pathlib import re import warnings from contextlib import contextmanager from getpass import getpass +from typing import Any, Callable import pymysql as client @@ -26,67 +29,98 @@ cache_key = "query_cache" # the key to lookup the query_cache folder in dj.config -def translate_query_error(client_error, query): +def translate_query_error(client_error: Exception, query: str) -> Exception: """ - Take client error and original query and return the corresponding DataJoint exception. - - :param client_error: the exception raised by the client interface - :param query: sql query with placeholders - :return: an instance of the corresponding subclass of datajoint.errors.DataJointError + Translate client error to the corresponding DataJoint exception. + + Parameters + ---------- + client_error : Exception + The exception raised by the client interface. + query : str + SQL query with placeholders. + + Returns + ------- + Exception + An instance of the corresponding DataJoint error subclass, + or the original error if no mapping exists. 
""" logger.debug("type: {}, args: {}".format(type(client_error), client_error.args)) err, *args = client_error.args - # Loss of connection errors - if err in (0, "(0, '')"): - return errors.LostConnectionError("Server connection lost due to an interface error.", *args) - if err == 2006: - return errors.LostConnectionError("Connection timed out", *args) - if err == 2013: - return errors.LostConnectionError("Server connection lost", *args) - # Access errors - if err in (1044, 1142): - return errors.AccessError("Insufficient privileges.", args[0], query) - # Integrity errors - if err == 1062: - return errors.DuplicateError(*args) - if err == 1217: # MySQL 8 error code - return errors.IntegrityError(*args) - if err == 1451: - return errors.IntegrityError(*args) - if err == 1452: - return errors.IntegrityError(*args) - # Syntax errors - if err == 1064: - return errors.QuerySyntaxError(args[0], query) - # Existence errors - if err == 1146: - return errors.MissingTableError(args[0], query) - if err == 1364: - return errors.MissingAttributeError(*args) - if err == 1054: - return errors.UnknownAttributeError(*args) - # all the other errors are re-raised in original form - return client_error - - -def conn(host=None, user=None, password=None, *, init_fun=None, reset=False, use_tls=None): + match err: + # Loss of connection errors + case 0 | "(0, '')": + return errors.LostConnectionError("Server connection lost due to an interface error.", *args) + case 2006: + return errors.LostConnectionError("Connection timed out", *args) + case 2013: + return errors.LostConnectionError("Server connection lost", *args) + + # Access errors + case 1044 | 1142: + return errors.AccessError("Insufficient privileges.", args[0], query) + + # Integrity errors + case 1062: + return errors.DuplicateError(*args) + case 1217 | 1451 | 1452: + return errors.IntegrityError(*args) + + # Syntax errors + case 1064: + return errors.QuerySyntaxError(args[0], query) + + # Existence errors + case 1146: + return errors.MissingTableError(args[0], query) + case 1364: + return errors.MissingAttributeError(*args) + case 1054: + return errors.UnknownAttributeError(*args) + + # All other errors pass through unchanged + case _: + return client_error + + +def conn( + host: str | None = None, + user: str | None = None, + password: str | None = None, + *, + init_fun: Callable | None = None, + reset: bool = False, + use_tls: bool | dict | None = None, +) -> Connection: """ - Returns a persistent connection object to be shared by multiple modules. + Return a persistent connection object shared by multiple modules. + If the connection is not yet established or reset=True, a new connection is set up. - If connection information is not provided, it is taken from config which takes the - information from dj_local_conf.json. If the password is not specified in that file - datajoint prompts for the password. - - :param host: hostname - :param user: mysql user - :param password: mysql password - :param init_fun: initialization function - :param reset: whether the connection should be reset or not - :param use_tls: TLS encryption option. Valid options are: True (required), False - (required no TLS), None (TLS preferred, default), dict (Manually specify values per - https://dev.mysql.com/doc/refman/8.0/en/connection-options.html#encrypted-connection-options). + If connection information is not provided, it is taken from config. + + Parameters + ---------- + host : str, optional + Database hostname. + user : str, optional + MySQL username. 
+ password : str, optional + MySQL password. Prompts if not provided. + init_fun : callable, optional + Initialization function called after connection. + reset : bool, optional + If True, reset existing connection. Default False. + use_tls : bool or dict, optional + TLS encryption option: True (required), False (no TLS), + None (preferred, default), or dict for manual configuration. + + Returns + ------- + Connection + Persistent database connection. """ if not hasattr(conn, "connection") or reset: host = host if host is not None else config["database.host"] @@ -128,20 +162,43 @@ def rowcount(self): class Connection: """ - A dj.Connection object manages a connection to a database server. - It also catalogues modules, schemas, tables, and their dependencies (foreign keys). - - Most of the parameters below should be set in the local configuration file. - - :param host: host name, may include port number as hostname:port, in which case it overrides the value in port - :param user: user name - :param password: password - :param port: port number - :param init_fun: connection initialization function (SQL) - :param use_tls: TLS encryption option + Manages a connection to a database server. + + Catalogues schemas, tables, and their dependencies (foreign keys). + Most parameters should be set in the configuration file. + + Parameters + ---------- + host : str + Hostname, may include port as ``hostname:port``. + user : str + Database username. + password : str + Database password. + port : int, optional + Port number. Overridden if specified in host. + init_fun : str, optional + SQL initialization command. + use_tls : bool or dict, optional + TLS encryption option. + + Attributes + ---------- + schemas : dict + Registered schema objects. + dependencies : Dependencies + Foreign key dependency graph. """ - def __init__(self, host, user, password, port=None, init_fun=None, use_tls=None): + def __init__( + self, + host: str, + user: str, + password: str, + port: int | None = None, + init_fun: str | None = None, + use_tls: bool | dict | None = None, + ) -> None: if ":" in host: # the port in the hostname overrides the port argument host, port = host.split(":") @@ -172,8 +229,8 @@ def __repr__(self): connected = "connected" if self.is_connected else "disconnected" return "DataJoint connection ({connected}) {user}@{host}:{port}".format(connected=connected, **self.conn_info) - def connect(self): - """Connect to the database server.""" + def connect(self) -> None: + """Establish or re-establish connection to the database server.""" with warnings.catch_warnings(): warnings.filterwarnings("ignore", ".*deprecated.*") try: @@ -198,38 +255,67 @@ def connect(self): ) self._conn.autocommit(True) - def set_query_cache(self, query_cache=None): + def set_query_cache(self, query_cache: str | None = None) -> None: """ - When query_cache is not None, the connection switches into the query caching mode, which entails: - 1. Only SELECT queries are allowed. - 2. The results of queries are cached under the path indicated by dj.config['query_cache'] - 3. query_cache is a string that differentiates different cache states. - - :param query_cache: a string to initialize the hash for query results + Enable query caching mode. + + When enabled: + 1. Only SELECT queries are allowed + 2. Results are cached under ``dj.config['query_cache']`` + 3. Cache key differentiates cache states + + Parameters + ---------- + query_cache : str, optional + String to initialize the hash for query results. + None disables caching. 
""" self._query_cache = query_cache - def purge_query_cache(self): - """Purges all query cache.""" + def purge_query_cache(self) -> None: + """Delete all cached query results.""" if isinstance(config.get(cache_key), str) and pathlib.Path(config[cache_key]).is_dir(): for path in pathlib.Path(config[cache_key]).iterdir(): if not path.is_dir(): path.unlink() - def close(self): + def close(self) -> None: + """Close the database connection.""" self._conn.close() - def register(self, schema): + def register(self, schema) -> None: + """ + Register a schema with this connection. + + Parameters + ---------- + schema : Schema + Schema object to register. + """ self.schemas[schema.database] = schema self.dependencies.clear() - def ping(self): - """Ping the connection or raises an exception if the connection is closed.""" + def ping(self) -> None: + """ + Ping the server to verify connection is alive. + + Raises + ------ + Exception + If the connection is closed. + """ self._conn.ping(reconnect=False) @property - def is_connected(self): - """Return true if the object is connected to the database server.""" + def is_connected(self) -> bool: + """ + Check if connected to the database server. + + Returns + ------- + bool + True if connected. + """ try: self.ping() except: @@ -247,16 +333,40 @@ def _execute_query(cursor, query, args, suppress_warnings): except client.err.Error as err: raise translate_query_error(err, query) - def query(self, query, args=(), *, as_dict=False, suppress_warnings=True, reconnect=None): + def query( + self, + query: str, + args: tuple = (), + *, + as_dict: bool = False, + suppress_warnings: bool = True, + reconnect: bool | None = None, + ): """ - Execute the specified query and return the tuple generator (cursor). - - :param query: SQL query - :param args: additional arguments for the client.cursor - :param as_dict: If as_dict is set to True, the returned cursor objects returns - query results as dictionary. - :param suppress_warnings: If True, suppress all warnings arising from underlying query library - :param reconnect: when None, get from config, when True, attempt to reconnect if disconnected + Execute a SQL query and return the cursor. + + Parameters + ---------- + query : str + SQL query to execute. + args : tuple, optional + Query parameters for prepared statement. + as_dict : bool, optional + If True, return rows as dictionaries. Default False. + suppress_warnings : bool, optional + If True, suppress SQL library warnings. Default True. + reconnect : bool, optional + If True, reconnect if disconnected. None uses config setting. + + Returns + ------- + cursor + Database cursor with query results. + + Raises + ------ + DataJointError + If non-SELECT query during query caching mode. """ # check cache first: use_query_cache = bool(self._query_cache) @@ -300,24 +410,39 @@ def query(self, query, args=(), *, as_dict=False, suppress_warnings=True, reconn return cursor - def get_user(self): + def get_user(self) -> str: """ - :return: the user name and host name provided by the client to the server. + Get the current user and host. + + Returns + ------- + str + User name and host as ``'user@host'``. """ return self.query("SELECT user()").fetchone()[0] # ---------- transaction processing @property - def in_transaction(self): + def in_transaction(self) -> bool: """ - :return: True if there is an open transaction. + Check if a transaction is open. + + Returns + ------- + bool + True if a transaction is in progress. 
""" self._in_transaction = self._in_transaction and self.is_connected return self._in_transaction - def start_transaction(self): + def start_transaction(self) -> None: """ - Starts a transaction error. + Start a new transaction. + + Raises + ------ + DataJointError + If a transaction is already in progress. """ if self.in_transaction: raise errors.DataJointError("Nested connections are not supported.") @@ -325,19 +450,14 @@ def start_transaction(self): self._in_transaction = True logger.debug("Transaction started") - def cancel_transaction(self): - """ - Cancels the current transaction and rolls back all changes made during the transaction. - """ + def cancel_transaction(self) -> None: + """Cancel the current transaction and roll back all changes.""" self.query("ROLLBACK") self._in_transaction = False logger.debug("Transaction cancelled. Rolling back ...") - def commit_transaction(self): - """ - Commit all changes made during the transaction and close it. - - """ + def commit_transaction(self) -> None: + """Commit all changes and close the transaction.""" self.query("COMMIT") self._in_transaction = False logger.debug("Transaction committed and closed.") @@ -347,14 +467,21 @@ def commit_transaction(self): @contextmanager def transaction(self): """ - Context manager for transactions. Opens an transaction and closes it after the with statement. - If an error is caught during the transaction, the commits are automatically rolled back. - All errors are raised again. - - Example: - >>> import datajoint as dj - >>> with dj.conn().transaction as conn: - >>> # transaction is open here + Context manager for transactions. + + Opens a transaction and automatically commits on success or rolls back + on exception. + + Yields + ------ + Connection + This connection object. + + Examples + -------- + >>> with dj.conn().transaction: + ... # All operations here are in one transaction + ... table.insert(data) """ try: self.start_transaction() diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index edf03a0a9..d469a9e78 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -1,8 +1,11 @@ +from __future__ import annotations + import copy import inspect import logging import re from itertools import count +from typing import TYPE_CHECKING, Any from .condition import ( AndList, @@ -22,6 +25,10 @@ from .preview import preview, repr_html from .settings import config +if TYPE_CHECKING: + from .connection import Connection + from .heading import Heading + logger = logging.getLogger(__name__.split(".")[0]) @@ -62,43 +69,85 @@ class QueryExpression: _distinct = False @property - def connection(self): - """a dj.Connection object""" + def connection(self) -> Connection: + """ + The database connection for this query. + + Returns + ------- + dj.Connection + Active database connection. + """ assert self._connection is not None return self._connection @property - def support(self): - """A list of table names or subqueries to from the FROM clause""" + def support(self) -> list[str | QueryExpression]: + """ + Tables or subqueries forming the FROM clause. + + Returns + ------- + list + Table names (str) or QueryExpression subqueries. + """ assert self._support is not None return self._support @property - def heading(self): - """a dj.Heading object, reflects the effects of the projection operator .proj""" + def heading(self) -> Heading: + """ + Column information after projection. + + Returns + ------- + dj.Heading + Heading reflecting any applied projection. 
+ """ return self._heading @property - def original_heading(self): - """a dj.Heading object reflecting the attributes before projection""" + def original_heading(self) -> Heading: + """ + Column information before projection. + + Returns + ------- + dj.Heading + Original heading without projection effects. + """ return self._original_heading or self.heading @property - def restriction(self): - """a AndList object of restrictions applied to input to produce the result""" + def restriction(self) -> AndList: + """ + Restrictions applied to produce the result. + + Returns + ------- + AndList + Conjunction of restriction conditions. + """ if self._restriction is None: self._restriction = AndList() return self._restriction @property - def restriction_attributes(self): - """the set of attribute names invoked in the WHERE clause""" + def restriction_attributes(self) -> set[str]: + """ + Attribute names used in the WHERE clause. + + Returns + ------- + set + Names of attributes referenced by restrictions. + """ if self._restriction_attributes is None: self._restriction_attributes = set() return self._restriction_attributes @property - def primary_key(self): + def primary_key(self) -> list[str]: return self.heading.primary_key _subquery_alias_count = count() # count for alias names used in the FROM clause @@ -140,11 +189,19 @@ def sorting_clauses(self): return clause - def make_sql(self, fields=None): + def make_sql(self, fields: list[str] | None = None) -> str: """ - Make the SQL SELECT statement. + Generate the SQL SELECT statement for this query. - :param fields: used to explicitly set the select attributes + Parameters + ---------- + fields : list, optional + Attribute names to select. If None, uses heading attributes. + + Returns + ------- + str + Complete SQL SELECT statement. """ return "SELECT {distinct}{fields} FROM {from_}{where}{sorting}".format( distinct="DISTINCT " if self._distinct else "", @@ -155,64 +212,61 @@ def make_sql(self, fields=None): ) # --------- query operators ----------- - def make_subquery(self): - """create a new SELECT statement where self is the FROM clause""" + def make_subquery(self) -> QueryExpression: + """ + Create a new query with this expression as a subquery. + + Returns + ------- + QueryExpression + New expression with self in the FROM clause. + """ result = QueryExpression() result._connection = self.connection result._support = [self] result._heading = self.heading.make_subquery_heading() return result - def restrict(self, restriction, semantic_check=True): + def restrict(self, restriction: Any, semantic_check: bool = True) -> QueryExpression: """ - Produces a new expression with the new restriction applied. - - :param restriction: a sequence or an array (treated as OR list), another QueryExpression, - an SQL condition string, or an AndList. - :param semantic_check: If True (default), use semantic matching - only match on + Apply a restriction (WHERE clause) to this expression. + + Parameters + ---------- + restriction : various + Condition to apply. Can be: + + - str: SQL condition (e.g., ``"x > 5"``) + - dict: Attribute-value pairs (equality) + - QueryExpression: Match on common attributes + - AndList: Conjunction of conditions + - list/tuple: Disjunction (OR) of conditions + - bool: True = no effect, False = empty result + + semantic_check : bool, optional + If True (default), use semantic matching - only match on homologous namesakes and error on non-homologous namesakes. - If False, use natural matching on all namesakes (no lineage checking). 
- :return: A new QueryExpression with the restriction applied. - - rel.restrict(restriction) is equivalent to rel & restriction. - rel.restrict(Not(restriction)) is equivalent to rel - restriction - - The primary key of the result is unaffected. - Successive restrictions are combined as logical AND: r & a & b is equivalent to r & AndList((a, b)) - Any QueryExpression, collection, or sequence other than an AndList are treated as OrLists - (logical disjunction of conditions) - Inverse restriction is accomplished by either using the subtraction operator or the Not class. - - The expressions in each row equivalent: - - rel & True rel - rel & False the empty entity set - rel & 'TRUE' rel - rel & 'FALSE' the empty entity set - rel - cond rel & Not(cond) - rel - 'TRUE' rel & False - rel - 'FALSE' rel - rel & AndList((cond1,cond2)) rel & cond1 & cond2 - rel & AndList() rel - rel & [cond1, cond2] rel & OrList((cond1, cond2)) - rel & [] rel & False - rel & None rel & False - rel & any_empty_entity_set rel & False - rel - AndList((cond1,cond2)) rel & [Not(cond1), Not(cond2)] - rel - [cond1, cond2] rel & Not(cond1) & Not(cond2) - rel - AndList() rel & False - rel - [] rel - rel - None rel - rel - any_empty_entity_set rel - - When arg is another QueryExpression, the restriction rel & arg restricts rel to elements that match at least - one element in arg (hence arg is treated as an OrList). - Conversely, rel - arg restricts rel to elements that do not match any elements in arg. - Two elements match when their common attributes have equal values or when they have no common attributes. - All shared attributes must be in the primary key of either rel or arg or both or an error will be raised. - - QueryExpression.restrict is the only access point that modifies restrictions. All other operators must - ultimately call restrict() + If False, use natural matching on all namesakes. + + Returns + ------- + QueryExpression + New expression with restriction applied. + + Notes + ----- + ``rel & restriction`` is equivalent to ``rel.restrict(restriction)``. + ``rel - restriction`` is equivalent to ``rel.restrict(Not(restriction))``. + + Successive restrictions combine as logical AND. + Collections (except AndList) are treated as OR lists. + + Examples + -------- + >>> table & "session_id > 5" # SQL condition + >>> table & {"subject": "mouse1"} # Equality + >>> table & other_table # Match on common attributes + >>> table - {"status": "failed"} # Anti-restriction """ attributes = set() if isinstance(restriction, Top): @@ -248,11 +302,14 @@ def restrict(self, restriction, semantic_check=True): def restrict_in_place(self, restriction): self.__dict__.update(self.restrict(restriction).__dict__) - def __and__(self, restriction): + def __and__(self, restriction: Any) -> QueryExpression: """ - Restriction operator e.g. ``q1 & q2``. - :return: a restricted copy of the input argument - See QueryExpression.restrict for more detail. + Restriction operator (``&``). + + Returns + ------- + QueryExpression + Restricted copy. See ``restrict()`` for details. """ return self.restrict(restriction) @@ -263,27 +320,38 @@ def __xor__(self, restriction): "Use .restrict(other, semantic_check=False) for restrictions without semantic checking." ) - def __sub__(self, restriction): + def __sub__(self, restriction: Any) -> QueryExpression: """ - Inverted restriction e.g. ``q1 - q2``. - :return: a restricted copy of the input argument - See QueryExpression.restrict for more detail. + Anti-restriction operator (``-``). 
+ + Returns + ------- + QueryExpression + Rows NOT matching the restriction. See ``restrict()`` for details. """ return self.restrict(Not(restriction)) - def __neg__(self): + def __neg__(self) -> Not | QueryExpression: """ - Convert between restriction and inverted restriction e.g. ``-q1``. - :return: target restriction - See QueryExpression.restrict for more detail. + Negation operator (``-expr``). + + Returns + ------- + Not + Negated restriction for use in other expressions. """ if isinstance(self, Not): return self.restriction return Not(self) - def __mul__(self, other): + def __mul__(self, other: QueryExpression) -> QueryExpression: """ - join of query expressions `self` and `other` e.g. ``q1 * q2``. + Join operator (``*``). + + Returns + ------- + QueryExpression + Joined result. See ``join()`` for details. """ return self.join(other) @@ -294,22 +362,33 @@ def __matmul__(self, other): "Use .join(other, semantic_check=False) for joins without semantic checking." ) - def join(self, other, semantic_check=True, left=False, allow_nullable_pk=False): + def join(self, other: QueryExpression | type, semantic_check: bool = True, left: bool = False, allow_nullable_pk: bool = False) -> QueryExpression: """ - Create the joined QueryExpression. - - :param other: QueryExpression to join with - :param semantic_check: If True (default), use semantic matching - only match on - homologous namesakes (same lineage) and error on non-homologous namesakes. - If False, use natural join on all namesakes (no lineage checking). - :param left: If True, perform a left join (retain all rows from self) - :param allow_nullable_pk: If True, bypass the left join constraint that requires - self to determine other. When bypassed, the result PK is the union of both - operands' PKs, and PK attributes from the right operand could be NULL. - Used internally by aggregation with keep_all_rows=True. - :return: The joined QueryExpression - - a * b is short for a.join(b) + Join this expression with another. + + Parameters + ---------- + other : QueryExpression + Expression to join with. + semantic_check : bool, optional + If True (default), use semantic matching - only match on + homologous namesakes (same lineage) and error on non-homologous + namesakes. If False, use natural join on all namesakes. + left : bool, optional + If True, perform a left join (retain all rows from self). + Requires self to determine other. Default False. + allow_nullable_pk : bool, optional + If True, bypass left join constraint. Used internally by + aggregation with keep_all_rows=True. Default False. + + Returns + ------- + QueryExpression + Joined result with combined attributes. + + Notes + ----- + ``a * b`` is equivalent to ``a.join(b)``. """ # Joining with U is no longer supported if isinstance(other, U): @@ -380,69 +459,88 @@ def join(self, other, semantic_check=True, left=False, allow_nullable_pk=False): assert len(result.support) == len(result._joins) + 1 return result - def extend(self, other, semantic_check=True): + def extend(self, other: QueryExpression, semantic_check: bool = True) -> QueryExpression: """ - Extend self with attributes from other. - - The extend operation adds attributes from `other` to `self` while preserving - self's entity identity. It is semantically equivalent to `self.join(other, left=True)` - but expresses a clearer intent: extending an entity set with additional attributes - rather than combining two entity sets. - - Requirements: - self β†’ other: Every attribute in other's primary key must exist in self. 
- This ensures: - - All rows of self are preserved (no filtering) - - Self's primary key remains the result's primary key (no NULL PKs) - - The operation is a true extension, not a Cartesian product - - Conceptual model: - Unlike a general join (Cartesian product restricted by matching attributes), - extend is closer to projectionβ€”it adds new attributes to existing entities - without changing which entities are in the result. - - Example: - # Session determines Trial (session_id is in Trial's PK) - # But Trial does NOT determine Session (trial_num not in Session) - - # Valid: extend trials with session info - Trial.extend(Session) # Adds 'date' from Session to each Trial - - # Invalid: Session cannot extend to Trial - Session.extend(Trial) # Error: trial_num not in Session - - :param other: QueryExpression whose attributes will extend self - :param semantic_check: If True (default), require homologous namesakes. - If False, match on all namesakes without lineage checking. - :return: Extended QueryExpression with self's PK and combined attributes - :raises DataJointError: If self does not determine other + Extend this expression with attributes from another. + + Adds attributes from ``other`` while preserving this expression's + entity identity. Semantically equivalent to ``self.join(other, left=True)``. + + Parameters + ---------- + other : QueryExpression + Expression whose attributes will extend self. + semantic_check : bool, optional + If True (default), require homologous namesakes. If False, match + on all namesakes without lineage checking. + + Returns + ------- + QueryExpression + Extended result with self's primary key and combined attributes. + + Raises + ------ + DataJointError + If self does not determine other (A β†’ B required). + + Notes + ----- + Requires ``self β†’ other``: every attribute in other's primary key + must exist in self. This ensures: + + - All rows of self are preserved + - Self's primary key remains the result's primary key + - No NULL values in primary key + + Examples + -------- + >>> # Trial determines Session (session_id in Trial's PK) + >>> Trial.extend(Session) # Adds session attrs to each trial """ return self.join(other, semantic_check=semantic_check, left=True) - def __add__(self, other): - """union e.g. ``q1 + q2``.""" + def __add__(self, other: QueryExpression) -> Union: + """ + Union operator (``+``). + + Returns + ------- + Union + Combined entity set with matching primary keys. + """ return Union.create(self, other) - def proj(self, *attributes, **named_attributes): - """ - Projection operator. - - :param attributes: attributes to be included in the result. (The primary key is already included). - :param named_attributes: new attributes computed or renamed from existing attributes. - :return: the projected expression. - Primary key attributes cannot be excluded but may be renamed. - If the attribute list contains an Ellipsis ..., then all secondary attributes are included too - Prefixing an attribute name with a dash '-attr' removes the attribute from the list if present. - Keyword arguments can be used to rename attributes as in name='attr', duplicate them as in name='(attr)', or - self.proj(...) 
or self.proj(Ellipsis) -- include all attributes (return self) - self.proj() -- include only primary key - self.proj('attr1', 'attr2') -- include primary key and attributes attr1 and attr2 - self.proj(..., '-attr1', '-attr2') -- include all attributes except attr1 and attr2 - self.proj(name1='attr1') -- include primary key and 'attr1' renamed as name1 - self.proj('attr1', dup='(attr1)') -- include primary key and attribute attr1 twice, with the duplicate 'dup' - self.proj(k='abs(attr1)') adds the new attribute k with the value computed as an expression (SQL syntax) - from other attributes available before the projection. - Each attribute name can only be used once. + def proj(self, *attributes: str, **named_attributes: str) -> QueryExpression: + """ + Select, rename, or compute attributes. + + Parameters + ---------- + *attributes : str + Attributes to include (primary key always included). + Use ``...`` (Ellipsis) to include all secondary attributes. + Prefix with ``-`` to exclude (e.g., ``"-attr"``). + **named_attributes : str + New or renamed attributes. Values can be: + + - ``"attr"``: Rename existing attribute + - ``"(attr)"``: Duplicate attribute with new name + - ``"expr"``: SQL expression computing new attribute + + Returns + ------- + QueryExpression + Projected expression with selected attributes. + + Examples + -------- + >>> table.proj() # Primary key only + >>> table.proj(...) # All attributes + >>> table.proj('a', 'b') # PK + 'a' + 'b' + >>> table.proj(..., '-secret') # All except 'secret' + >>> table.proj(new_name='old') # Rename 'old' to 'new_name' + >>> table.proj(total='x + y') # Computed attribute """ named_attributes = {k: translate_attribute(v)[1] for k, v in named_attributes.items()} # new attributes in parentheses are included again with the new name without removing original @@ -538,15 +636,32 @@ def proj(self, *attributes, **named_attributes): ) return result - def aggr(self, group, *attributes, keep_all_rows=False, **named_attributes): + def aggr(self, group: QueryExpression, *attributes: str, keep_all_rows: bool = False, **named_attributes: str) -> QueryExpression: """ - Aggregation of the type U('attr1','attr2').aggr(group, computation="QueryExpression") - has the primary key ('attr1','attr2') and performs aggregation computations for all matching elements of `group`. - - :param group: The query expression to be aggregated. - :param keep_all_rows: True=keep all the rows from self. False=keep only rows that match entries in group. - :param named_attributes: computations of the form new_attribute="sql expression on attributes of group" - :return: The derived query expression + Aggregate data grouped by this expression's primary key. + + Parameters + ---------- + group : QueryExpression + Expression to be aggregated. + *attributes : str + Attributes from self to include (primary key always included). + keep_all_rows : bool, optional + If True, keep all rows from self (left join). If False (default), + keep only rows matching entries in group. + **named_attributes : str + Aggregation computations as SQL expressions, e.g., + ``count="count(*)"``, ``total="sum(value)"``. + + Returns + ------- + QueryExpression + Aggregated result. 
+ + Examples + -------- + >>> Session.aggr(Trial, n_trials="count(*)") + >>> Subject.aggr(Session, ..., first="min(session_date)") """ if Ellipsis in attributes: # expand ellipsis to include only attributes from the left table @@ -592,16 +707,28 @@ def _apply_top(self, order_by=None, limit=None, offset=None): return self.restrict(Top(limit, order_by, offset)) return self - def to_dicts(self, order_by=None, limit=None, offset=None, squeeze=False, download_path="."): + def to_dicts(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = ".") -> list[dict[str, Any]]: """ Fetch all rows as a list of dictionaries. - :param order_by: attribute(s) to order by, or "KEY"/"KEY DESC" - :param limit: maximum number of rows to return - :param offset: number of rows to skip - :param squeeze: if True, remove extra dimensions from arrays - :param download_path: path for downloading external data (attachments, filepaths) - :return: list of dictionaries, one per row + Parameters + ---------- + order_by : str or list, optional + Attribute(s) to order by. Use ``"KEY"`` or ``"KEY DESC"`` for + primary key ordering. + limit : int, optional + Maximum number of rows to return. + offset : int, optional + Number of rows to skip. + squeeze : bool, optional + If True, remove extra dimensions from arrays. Default False. + download_path : str, optional + Path for downloading external data. Default ``"."``. + + Returns + ------- + list[dict] + One dictionary per row with attribute names as keys. """ expr = self._apply_top(order_by, limit, offset) cursor = expr.cursor(as_dict=True) @@ -611,16 +738,31 @@ def to_dicts(self, order_by=None, limit=None, offset=None, squeeze=False, downlo for row in cursor ] - def to_pandas(self, order_by=None, limit=None, offset=None, squeeze=False, download_path="."): + def to_pandas(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = ".") -> pandas.DataFrame: """ Fetch all rows as a pandas DataFrame with primary key as index. - :param order_by: attribute(s) to order by, or "KEY"/"KEY DESC" - :param limit: maximum number of rows to return - :param offset: number of rows to skip - :param squeeze: if True, remove extra dimensions from arrays - :param download_path: path for downloading external data - :return: pandas DataFrame with primary key columns as index + Parameters + ---------- + order_by : str or list, optional + Attribute(s) to order by. + limit : int, optional + Maximum number of rows to return. + offset : int, optional + Number of rows to skip. + squeeze : bool, optional + If True, remove extra dimensions from arrays. Default False. + download_path : str, optional + Path for downloading external data. Default ``"."``. + + Returns + ------- + pandas.DataFrame + DataFrame with primary key columns as index. + + See Also + -------- + insert_dataframe : Insert DataFrame back to table. 
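+
+        Examples
+        --------
+        A minimal usage sketch; the ``Session`` table name is illustrative:
+
+        >>> df = Session.to_pandas(order_by="KEY", limit=10)
+        >>> df.columns   # secondary attributes; primary key columns form the index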
""" dicts = self.to_dicts(order_by=order_by, limit=limit, offset=offset, squeeze=squeeze, download_path=download_path) df = pandas.DataFrame(dicts) @@ -628,18 +770,32 @@ def to_pandas(self, order_by=None, limit=None, offset=None, squeeze=False, downl df = df.set_index(self.primary_key) return df - def to_polars(self, order_by=None, limit=None, offset=None, squeeze=False, download_path="."): + def to_polars(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = "."): """ Fetch all rows as a polars DataFrame. - Requires polars: pip install datajoint[polars] - - :param order_by: attribute(s) to order by, or "KEY"/"KEY DESC" - :param limit: maximum number of rows to return - :param offset: number of rows to skip - :param squeeze: if True, remove extra dimensions from arrays - :param download_path: path for downloading external data - :return: polars DataFrame + Parameters + ---------- + order_by : str or list, optional + Attribute(s) to order by. + limit : int, optional + Maximum number of rows to return. + offset : int, optional + Number of rows to skip. + squeeze : bool, optional + If True, remove extra dimensions from arrays. Default False. + download_path : str, optional + Path for downloading external data. Default ``"."``. + + Returns + ------- + polars.DataFrame + Polars DataFrame with all rows. + + Raises + ------ + ImportError + If polars is not installed. Install with ``pip install datajoint[polars]``. """ try: import polars @@ -648,18 +804,32 @@ def to_polars(self, order_by=None, limit=None, offset=None, squeeze=False, downl dicts = self.to_dicts(order_by=order_by, limit=limit, offset=offset, squeeze=squeeze, download_path=download_path) return polars.DataFrame(dicts) - def to_arrow(self, order_by=None, limit=None, offset=None, squeeze=False, download_path="."): + def to_arrow(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = "."): """ Fetch all rows as a PyArrow Table. - Requires pyarrow: pip install datajoint[arrow] - - :param order_by: attribute(s) to order by, or "KEY"/"KEY DESC" - :param limit: maximum number of rows to return - :param offset: number of rows to skip - :param squeeze: if True, remove extra dimensions from arrays - :param download_path: path for downloading external data - :return: pyarrow Table + Parameters + ---------- + order_by : str or list, optional + Attribute(s) to order by. + limit : int, optional + Maximum number of rows to return. + offset : int, optional + Number of rows to skip. + squeeze : bool, optional + If True, remove extra dimensions from arrays. Default False. + download_path : str, optional + Path for downloading external data. Default ``"."``. + + Returns + ------- + pyarrow.Table + PyArrow Table with all rows. + + Raises + ------ + ImportError + If pyarrow is not installed. Install with ``pip install datajoint[arrow]``. 
""" try: import pyarrow @@ -670,21 +840,39 @@ def to_arrow(self, order_by=None, limit=None, offset=None, squeeze=False, downlo return pyarrow.table({}) return pyarrow.Table.from_pylist(dicts) - def to_arrays(self, *attrs, include_key=False, order_by=None, limit=None, offset=None, squeeze=False, download_path="."): + def to_arrays(self, *attrs: str, include_key: bool = False, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = ".") -> np.ndarray | tuple[np.ndarray, ...]: """ Fetch data as numpy arrays. - If no attrs specified, returns a numpy structured array (recarray) of all columns. - If attrs specified, returns a tuple of numpy arrays (one per attribute). - - :param attrs: attribute names to fetch (if empty, fetch all) - :param include_key: if True and attrs specified, include primary key columns - :param order_by: attribute(s) to order by, or "KEY"/"KEY DESC" - :param limit: maximum number of rows to return - :param offset: number of rows to skip - :param squeeze: if True, remove extra dimensions from arrays - :param download_path: path for downloading external data - :return: numpy recarray (no attrs) or tuple of arrays (with attrs) + Parameters + ---------- + *attrs : str + Attribute names to fetch. If empty, fetch all as structured array. + include_key : bool, optional + If True and attrs specified, include primary key columns. Default False. + order_by : str or list, optional + Attribute(s) to order by. + limit : int, optional + Maximum number of rows to return. + offset : int, optional + Number of rows to skip. + squeeze : bool, optional + If True, remove extra dimensions from arrays. Default False. + download_path : str, optional + Path for downloading external data. Default ``"."``. + + Returns + ------- + numpy.ndarray or tuple + If no attrs: structured array (recarray) of all columns. + If single attr: 1D array of values. + If multiple attrs: tuple of arrays. + + Examples + -------- + >>> table.to_arrays() # Structured array of all columns + >>> table.to_arrays('x', 'y') # Tuple of two arrays + >>> x = table.to_arrays('x') # Single array """ from functools import partial @@ -742,37 +930,67 @@ def to_arrays(self, *attrs, include_key=False, order_by=None, limit=None, offset ret[name] = list(map(partial(get, heading[name]), ret[name])) return ret - def keys(self, order_by=None, limit=None, offset=None): + def keys(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None) -> list[dict[str, Any]]: """ Fetch primary key values as a list of dictionaries. - :param order_by: attribute(s) to order by, or "KEY"/"KEY DESC" - :param limit: maximum number of rows to return - :param offset: number of rows to skip - :return: list of dictionaries containing only primary key columns + Parameters + ---------- + order_by : str or list, optional + Attribute(s) to order by. + limit : int, optional + Maximum number of rows to return. + offset : int, optional + Number of rows to skip. + + Returns + ------- + list[dict] + Primary key values only. """ return self.proj().to_dicts(order_by=order_by, limit=limit, offset=offset) - def head(self, limit=25): + def head(self, limit: int = 25) -> list[dict[str, Any]]: """ - Preview the first few entries from query expression. + Preview the first entries from the query result. + + Parameters + ---------- + limit : int, optional + Number of entries to return. Default 25. 
- :param limit: number of entries (default 25) - :return: list of dictionaries + Returns + ------- + list[dict] + First entries ordered by primary key. """ return self.to_dicts(order_by="KEY", limit=limit) - def tail(self, limit=25): + def tail(self, limit: int = 25) -> list[dict[str, Any]]: """ - Preview the last few entries from query expression. + Preview the last entries from the query result. - :param limit: number of entries (default 25) - :return: list of dictionaries + Parameters + ---------- + limit : int, optional + Number of entries to return. Default 25. + + Returns + ------- + list[dict] + Last entries ordered by primary key. """ return list(reversed(self.to_dicts(order_by="KEY DESC", limit=limit))) - def __len__(self): - """:return: number of elements in the result set e.g. ``len(q1)``.""" + def __len__(self) -> int: + """ + Return the number of rows in the result set. + + Returns + ------- + int + Row count. + """ result = self.make_subquery() if self._top else copy.copy(self) has_left_join = any(is_left for is_left, _ in result._joins) return result.connection.query( @@ -789,10 +1007,14 @@ def __len__(self): ) ).fetchone()[0] - def __bool__(self): + def __bool__(self) -> bool: """ - :return: True if the result is not empty. Equivalent to len(self) > 0 but often - faster e.g. ``bool(q1)``. + Check if the result set is non-empty. + + Returns + ------- + bool + True if at least one row exists. More efficient than ``len(self) > 0``. """ return bool( self.connection.query( diff --git a/src/datajoint/fetch.py b/src/datajoint/fetch.py index 97df03ba4..713ff0be7 100644 --- a/src/datajoint/fetch.py +++ b/src/datajoint/fetch.py @@ -15,21 +15,33 @@ def _get(connection, attr, data, squeeze, download_path): Retrieve and decode attribute data from the database. In the simplified type system: + - Native types pass through unchanged - JSON types are parsed - UUID types are converted from bytes - Blob types return raw bytes (unless a codec handles them) - Codecs handle all custom encoding/decoding via type chains - For composed types (e.g., using ), decoders are applied - in reverse order: innermost first, then outermost. - - :param connection: a dj.Connection object - :param attr: attribute from the table's heading - :param data: raw value fetched from the database - :param squeeze: if True squeeze blobs (legacy, unused) - :param download_path: for fetches that download data (attachments, filepaths) - :return: decoded data + For composed types (e.g., ```` using ````), decoders are + applied in reverse order: innermost first, then outermost. + + Parameters + ---------- + connection : dj.Connection + Database connection. + attr : Attribute + Attribute from the table's heading. + data : any + Raw value fetched from the database. + squeeze : bool + If True, squeeze singleton dimensions from arrays. + download_path : str + Path for downloading external data (attachments, filepaths). + + Returns + ------- + any + Decoded data in Python-native format. """ from .settings import config @@ -88,9 +100,12 @@ def _get(connection, attr, data, squeeze, download_path): class Fetch1: """ - Fetch object for fetching the result of a query yielding exactly one row. + Fetch handler for queries that return exactly one row. - :param expression: a query expression to fetch from. + Parameters + ---------- + expression : QueryExpression + Query expression to fetch from. 
""" def __init__(self, expression): @@ -98,21 +113,36 @@ def __init__(self, expression): def __call__(self, *attrs, squeeze=False, download_path="."): """ - Fetches the result of a query expression that yields exactly one entry. - - If no attributes are specified, returns the result as a dict. - If attributes are specified returns the corresponding results as a tuple. - - Examples: - d = rel.fetch1() # returns dict with all attributes - a, b = rel.fetch1('a', 'b') # returns tuple of attribute values - - :param attrs: attributes to return when expanding into a tuple. - If empty, returns a dict with all attributes. - :param squeeze: when True, remove extra dimensions from arrays - :param download_path: for fetches that download data, e.g. attachments - :return: dict (no attrs) or tuple/value (with attrs) - :raises DataJointError: if not exactly one row in result + Fetch exactly one row from the query result. + + Parameters + ---------- + *attrs : str + Attribute names to return. If empty, returns all as dict. + Use ``"KEY"`` to fetch primary key as a dict. + squeeze : bool, optional + If True, remove singleton dimensions from arrays. Default False. + download_path : str, optional + Path for downloading external data. Default ``"."``. + + Returns + ------- + dict or tuple or value + If no attrs: dict with all attributes. + If one attr: single value. + If multiple attrs: tuple of values. + + Raises + ------ + DataJointError + If query does not return exactly one row. + + Examples + -------- + >>> row = table.fetch1() # dict with all attributes + >>> a, b = table.fetch1('a', 'b') # tuple of values + >>> x = table.fetch1('x') # single value + >>> pk = table.fetch1('KEY') # primary key dict """ heading = self._expression.heading diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 8b0004f29..7d35c8263 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -1,7 +1,17 @@ +""" +Heading management for DataJoint tables. + +This module provides the Heading class for managing table column metadata, +including attribute types, constraints, and lineage information. +""" + +from __future__ import annotations + import logging import re from collections import defaultdict, namedtuple from itertools import chain +from typing import TYPE_CHECKING, Any import numpy as np @@ -15,6 +25,9 @@ from .errors import DataJointError from .lineage import get_table_lineages, lineage_table_exists +if TYPE_CHECKING: + from .connection import Connection + class _MissingType(Codec, register=False): """Placeholder for missing/unregistered codecs. Raises error on use.""" @@ -70,40 +83,85 @@ def decode(self, stored, *, key=None): class Attribute(namedtuple("_Attribute", default_attribute_properties)): """ - Properties of a table column (attribute) + Properties of a table column (attribute). + + Attributes + ---------- + name : str + Attribute name. + type : str + Database type string. + in_key : bool + True if part of primary key. + nullable : bool + True if NULL values allowed. + default : any + Default value. + comment : str + Attribute comment/description. + codec : Codec + Codec for encoding/decoding values. + lineage : str + Origin of attribute for semantic matching. """ - def todict(self): - """Convert namedtuple to dict.""" + def todict(self) -> dict[str, Any]: + """Convert to dictionary.""" return dict((name, self[i]) for i, name in enumerate(self._fields)) @property - def sql_type(self): - """:return: datatype (as string) in database. 
In most cases, it is the same as self.type""" + def sql_type(self) -> str: + """ + Return the SQL datatype string. + + Returns + ------- + str + Database type (usually same as self.type). + """ # UUID is now a core type alias - already resolved to binary(16) return self.type @property - def sql_comment(self): - """:return: full comment for the SQL declaration. Includes custom type specification""" + def sql_comment(self) -> str: + """ + Return the full SQL comment including type markers. + + Returns + ------- + str + Comment with optional ``:uuid:`` prefix. + """ # UUID info is stored in the comment for reconstruction return (":uuid:" if self.uuid else "") + self.comment @property - def sql(self): + def sql(self) -> str: """ - Convert primary key attribute tuple into its SQL CREATE TABLE clause. - Default values are not reflected. - This is used for declaring foreign keys in referencing tables + Generate SQL clause for this attribute in CREATE TABLE. + + Used for declaring foreign keys in referencing tables. + Default values are not included. - :return: SQL code for attribute declaration + Returns + ------- + str + SQL attribute declaration. """ return '`{name}` {type} NOT NULL COMMENT "{comment}"'.format( name=self.name, type=self.sql_type, comment=self.sql_comment ) @property - def original_name(self): + def original_name(self) -> str: + """ + Return the original attribute name before any renaming. + + Returns + ------- + str + Original name from attribute_expression or current name. + """ if self.attribute_expression is None: return self.name assert self.attribute_expression.startswith("`") @@ -112,18 +170,32 @@ def original_name(self): class Heading: """ - Local class for table headings. - Heading contains the property attributes, which is an dict in which the keys are - the attribute names and the values are Attributes. + Table heading containing column metadata. + + Manages attribute information including names, types, constraints, + and lineage for semantic matching. + + Parameters + ---------- + attribute_specs : list, optional + List of attribute specification dictionaries. + table_info : dict, optional + Database table information for lazy loading. + lineage_available : bool, optional + Whether lineage information is available. Default True. + + Attributes + ---------- + attributes : dict + Mapping of attribute names to Attribute objects. 
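+
+    Examples
+    --------
+    A minimal sketch; ``Session`` is an illustrative table:
+
+    >>> heading = Session.heading
+    >>> heading.primary_key            # primary key attribute names
+    >>> heading.secondary_attributes   # non-key attribute names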
""" - def __init__(self, attribute_specs=None, table_info=None, lineage_available=True): - """ - - :param attribute_specs: a list of dicts with the same keys as Attribute - :param table_info: a dict with information to load the heading from the database - :param lineage_available: whether lineage tracking is available for this heading - """ + def __init__( + self, + attribute_specs: list[dict] | None = None, + table_info: dict | None = None, + lineage_available: bool = True, + ) -> None: self.indexes = None self.table_info = table_info self._table_status = None @@ -131,15 +203,16 @@ def __init__(self, attribute_specs=None, table_info=None, lineage_available=True self._attributes = None if attribute_specs is None else dict((q["name"], Attribute(**q)) for q in attribute_specs) @property - def lineage_available(self): + def lineage_available(self) -> bool: """Whether lineage tracking is available for this heading's schema.""" return self._lineage_available - def __len__(self): + def __len__(self) -> int: return 0 if self.attributes is None else len(self.attributes) @property - def table_status(self): + def table_status(self) -> dict | None: + """Table status information from database.""" if self.table_info is None: return None if self._table_status is None: @@ -147,59 +220,73 @@ def table_status(self): return self._table_status @property - def attributes(self): + def attributes(self) -> dict[str, Attribute]: + """ + Mapping of attribute names to Attribute objects. + + Excludes hidden attributes (names starting with ``_``). + """ if self._attributes is None: self._init_from_database() # lazy loading from database return {k: v for k, v in self._attributes.items() if not v.is_hidden} @property - def names(self): + def names(self) -> list[str]: + """List of visible attribute names.""" return [k for k in self.attributes] @property - def primary_key(self): + def primary_key(self) -> list[str]: + """List of primary key attribute names.""" return [k for k, v in self.attributes.items() if v.in_key] @property - def secondary_attributes(self): + def secondary_attributes(self) -> list[str]: + """List of non-primary-key attribute names.""" return [k for k, v in self.attributes.items() if not v.in_key] - def determines(self, other): + def determines(self, other: Heading) -> bool: """ Check if self determines other (self β†’ other). - A determines B iff every attribute in PK(B) is in A. + A determines B iff every attribute in PK(B) is in A. This means + knowing A's primary key is sufficient to determine B's primary key + through functional dependencies. - This means knowing A's primary key is sufficient to determine B's primary key - through the functional dependencies implied by A's structure. + Parameters + ---------- + other : Heading + Another Heading object. - :param other: Another Heading object - :return: True if self determines other + Returns + ------- + bool + True if self determines other. 
""" self_attrs = set(self.names) return all(attr in self_attrs for attr in other.primary_key) @property - def blobs(self): + def blobs(self) -> list[str]: + """List of blob attribute names.""" return [k for k, v in self.attributes.items() if v.is_blob] @property - def non_blobs(self): - """Attributes that are not blobs or JSON (used for simple column handling).""" + def non_blobs(self) -> list[str]: + """Attributes that are not blobs or JSON.""" return [k for k, v in self.attributes.items() if not (v.is_blob or v.json)] @property - def new_attributes(self): + def new_attributes(self) -> list[str]: + """Attributes with computed expressions (projections).""" return [k for k, v in self.attributes.items() if v.attribute_expression is not None] - def __getitem__(self, name): - """shortcut to the attribute""" + def __getitem__(self, name: str) -> Attribute: + """Get attribute by name.""" return self.attributes[name] - def __repr__(self): - """ - :return: heading representation in DataJoint declaration format but without foreign key expansion - """ + def __repr__(self) -> str: + """Return heading in DataJoint declaration format.""" in_key = True ret = "" if self._table_status is not None: @@ -216,19 +303,37 @@ def __repr__(self): return ret @property - def has_autoincrement(self): + def has_autoincrement(self) -> bool: + """Check if any attribute has auto_increment.""" return any(e.autoincrement for e in self.attributes.values()) @property - def as_dtype(self): + def as_dtype(self) -> np.dtype: """ - represent the heading as a numpy dtype + Return heading as a numpy dtype. + + Returns + ------- + numpy.dtype + Structured dtype for creating numpy arrays. """ return np.dtype(dict(names=self.names, formats=[v.dtype for v in self.attributes.values()])) - def as_sql(self, fields, include_aliases=True): + def as_sql(self, fields: list[str], include_aliases: bool = True) -> str: """ - represent heading as the SQL SELECT clause. + Generate SQL SELECT clause for specified fields. + + Parameters + ---------- + fields : list[str] + Attribute names to include. + include_aliases : bool, optional + Include AS clauses for computed attributes. Default True. + + Returns + ------- + str + Comma-separated SQL field list. """ return ",".join( ( @@ -242,8 +347,8 @@ def as_sql(self, fields, include_aliases=True): def __iter__(self): return iter(self.attributes) - def _init_from_database(self): - """initialize heading from an existing database table.""" + def _init_from_database(self) -> None: + """Initialize heading from an existing database table.""" conn, database, table_name, context = (self.table_info[k] for k in ("conn", "database", "table_name", "context")) info = conn.query( 'SHOW TABLE STATUS FROM `{database}` WHERE name="{table_name}"'.format(table_name=table_name, database=database), diff --git a/src/datajoint/jobs.py b/src/datajoint/jobs.py index 24a9e47a6..57533b8f3 100644 --- a/src/datajoint/jobs.py +++ b/src/datajoint/jobs.py @@ -2,10 +2,12 @@ Job queue management for AutoPopulate 2.0. Each auto-populated table (Computed/Imported) has an associated jobs table -with the naming pattern ~~table_name. The jobs table tracks job status, +with the naming pattern ``~~table_name``. The jobs table tracks job status, priority, scheduling, and error information. """ +from __future__ import annotations + import logging import os import platform @@ -26,7 +28,9 @@ def _get_job_version() -> str: """ Get version string based on config settings. 
- Returns: + Returns + ------- + str Version string, or empty string if version tracking disabled. """ from .settings import config @@ -53,21 +57,44 @@ class Job(Table): Per-table job queue for AutoPopulate 2.0. Each auto-populated table (Computed/Imported) has an associated job table - with the naming pattern ~~table_name. The job table tracks job status, + with the naming pattern ``~~table_name``. The job table tracks job status, priority, scheduling, and error information. - Access via the `jobs` property on any auto-populated table: - MyTable.jobs.refresh() - MyTable.jobs.pending - MyTable.jobs.errors + Parameters + ---------- + target_table : Table + The Computed/Imported table instance this jobs table manages. + + Attributes + ---------- + target : Table + The auto-populated table this jobs table manages. + pending : QueryExpression + Query for jobs with ``status='pending'``. + reserved : QueryExpression + Query for jobs with ``status='reserved'``. + errors : QueryExpression + Query for jobs with ``status='error'``. + completed : QueryExpression + Query for jobs with ``status='success'``. + ignored : QueryExpression + Query for jobs with ``status='ignore'``. + + Examples + -------- + >>> MyTable.jobs.refresh() # Add new jobs, clean up stale ones + >>> MyTable.jobs.pending # Query pending jobs + >>> MyTable.jobs.errors # Query failed jobs """ - def __init__(self, target_table): + def __init__(self, target_table: Table) -> None: """ Initialize jobs table for an auto-populated table. - Args: - target_table: The Computed/Imported table instance this jobs table manages. + Parameters + ---------- + target_table : Table + The Computed/Imported table instance this jobs table manages. """ self._target = target_table self._connection = target_table.connection @@ -109,7 +136,9 @@ def _generate_definition(self) -> str: """ Generate jobs table definition from target's FK-derived primary key. - Returns: + Returns + ------- + str DataJoint table definition string. """ pk_attrs = self._get_fk_derived_pk_attrs() @@ -148,7 +177,9 @@ def _get_fk_derived_pk_attrs(self) -> list[tuple[str, str]]: FK-derived attributes are those that come from primary FK references. Uses connection.dependencies to identify FK relationships. - Returns: + Returns + ------- + list[tuple[str, str]] List of (attribute_name, datatype) tuples in target PK order. """ heading = self._target.heading @@ -181,15 +212,27 @@ def _get_fk_derived_pk_attrs(self) -> list[tuple[str, str]]: return fk_attrs def _get_pk(self, key: dict) -> dict: - """Extract primary key values from a key dict.""" + """ + Extract primary key values from a key dict. + + Parameters + ---------- + key : dict + Dictionary containing at least the primary key attributes. + + Returns + ------- + dict + Dictionary with only the primary key attributes. 
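+
+        Examples
+        --------
+        A minimal sketch, assuming the job table's primary key is ``session_id``:
+
+        >>> jobs._get_pk({"session_id": 1, "status": "pending"})
+        {'session_id': 1}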
+ """ return {k: key[k] for k in self.primary_key if k in key} - def delete(self): - """Bypass interactive prompts and dependencies.""" + def delete(self) -> None: + """Delete all entries, bypassing interactive prompts and dependencies.""" self.delete_quick() - def drop(self): - """Bypass interactive prompts and dependencies.""" + def drop(self) -> None: + """Drop the table, bypassing interactive prompts and dependencies.""" self.drop_quick() # ------------------------------------------------------------------------- @@ -197,28 +240,63 @@ def drop(self): # ------------------------------------------------------------------------- @property - def pending(self): - """Return query for pending jobs.""" + def pending(self) -> "Job": + """ + Query for pending jobs awaiting processing. + + Returns + ------- + Job + Restricted query with ``status='pending'``. + """ return self & 'status="pending"' @property - def reserved(self): - """Return query for reserved jobs.""" + def reserved(self) -> "Job": + """ + Query for jobs currently being processed. + + Returns + ------- + Job + Restricted query with ``status='reserved'``. + """ return self & 'status="reserved"' @property - def errors(self): - """Return query for error jobs.""" + def errors(self) -> "Job": + """ + Query for jobs that failed with errors. + + Returns + ------- + Job + Restricted query with ``status='error'``. + """ return self & 'status="error"' @property - def ignored(self): - """Return query for ignored jobs.""" + def ignored(self) -> "Job": + """ + Query for jobs marked to be skipped. + + Returns + ------- + Job + Restricted query with ``status='ignore'``. + """ return self & 'status="ignore"' @property - def completed(self): - """Return query for completed (success) jobs.""" + def completed(self) -> "Job": + """ + Query for successfully completed jobs. + + Returns + ------- + Job + Restricted query with ``status='success'``. + """ return self & 'status="success"' # ------------------------------------------------------------------------- @@ -236,33 +314,39 @@ def refresh( """ Refresh the jobs queue: add new jobs and clean up stale/orphaned jobs. + Parameters + ---------- + *restrictions : any + Conditions to filter key_source (for adding new jobs). + delay : float, optional + Seconds from now until new jobs become available for processing. + Default 0 (immediately available). Uses database server time. + priority : int, optional + Priority for new jobs (lower = more urgent). + Default from ``config.jobs.default_priority``. + stale_timeout : float, optional + Seconds after which jobs are checked for staleness. + Jobs older than this are removed if key not in key_source. + Default from ``config.jobs.stale_timeout``. Set to 0 to skip. + orphan_timeout : float, optional + Seconds after which reserved jobs are considered orphaned. + Reserved jobs older than this are deleted and re-added as pending. + Default None (no orphan cleanup). + + Returns + ------- + dict + Status counts with keys: ``'added'``, ``'removed'``, + ``'orphaned'``, ``'re_pended'``. + + Notes + ----- Operations performed: - 1. Add new jobs: (key_source & restrictions) - target - jobs -> insert as 'pending' - 2. Re-pend success jobs: if keep_completed=True and key in key_source but not in target + + 1. Add new jobs: ``(key_source & restrictions) - target - jobs`` β†’ insert as pending + 2. Re-pend success jobs: if ``keep_completed=True`` and key in key_source but not in target 3. Remove stale jobs: jobs older than stale_timeout whose keys not in key_source 4. 
Remove orphaned jobs: reserved jobs older than orphan_timeout (if specified) - - Args: - restrictions: Conditions to filter key_source (for adding new jobs). - delay: Seconds from now until new jobs become available for processing. - Default: 0 (immediately available). Uses database server time. - priority: Priority for new jobs (lower = more urgent). - Default from config.jobs.default_priority. - stale_timeout: Seconds after which jobs are checked for staleness. - Jobs older than this are removed if key not in key_source. - Default from config.jobs.stale_timeout. - Set to 0 to skip stale cleanup. - orphan_timeout: Seconds after which reserved jobs are considered orphaned. - Reserved jobs older than this are deleted and re-added as pending. - Default: None (no orphan cleanup - must be explicit). - - Returns: - { - 'added': int, # New pending jobs added - 'removed': int, # Stale jobs removed - 'orphaned': int, # Orphaned jobs reset to pending - 're_pended': int # Success jobs re-pended (keep_completed mode) - } """ from datetime import datetime, timedelta @@ -346,12 +430,17 @@ def reserve(self, key: dict) -> bool: """ Attempt to reserve a pending job for processing. - Updates status to 'reserved' if currently 'pending' and scheduled_time <= now. + Updates status to ``'reserved'`` if currently ``'pending'`` and + ``scheduled_time <= now``. - Args: - key: Primary key dict of the job to reserve. + Parameters + ---------- + key : dict + Primary key dict of the job to reserve. - Returns: + Returns + ------- + bool True if reservation successful, False if job not available. """ from datetime import datetime @@ -386,13 +475,19 @@ def complete(self, key: dict, duration: float | None = None) -> None: """ Mark a job as successfully completed. - Based on config.jobs.keep_completed: - - If True: updates status to 'success' with completion time and duration - - If False: deletes the job entry + Parameters + ---------- + key : dict + Primary key dict of the job. + duration : float, optional + Execution duration in seconds. - Args: - key: Primary key dict of the job. - duration: Execution duration in seconds. + Notes + ----- + Based on ``config.jobs.keep_completed``: + + - If True: updates status to ``'success'`` with completion time and duration + - If False: deletes the job entry """ from datetime import datetime @@ -415,10 +510,14 @@ def error(self, key: dict, error_message: str, error_stack: str | None = None) - """ Mark a job as failed with error details. - Args: - key: Primary key dict of the job. - error_message: Error message (truncated to 2047 chars). - error_stack: Full stack trace. + Parameters + ---------- + key : dict + Primary key dict of the job. + error_message : str + Error message (truncated to 2047 chars if longer). + error_stack : str, optional + Full stack trace. """ from datetime import datetime @@ -441,11 +540,13 @@ def ignore(self, key: dict) -> None: """ Mark a job to be ignored (skipped during populate). - If the key doesn't exist in the jobs table, inserts it with status='ignore'. - If it exists, updates the status to 'ignore'. + If the key doesn't exist in the jobs table, inserts it with + ``status='ignore'``. If it exists, updates the status to ``'ignore'``. - Args: - key: Primary key dict of the job. + Parameters + ---------- + key : dict + Primary key dict of the job. """ from .settings import config @@ -460,15 +561,11 @@ def progress(self) -> dict: """ Return job status breakdown. 
- Returns: - { - 'pending': int, - 'reserved': int, - 'success': int, - 'error': int, - 'ignore': int, - 'total': int - } + Returns + ------- + dict + Counts by status with keys: ``'pending'``, ``'reserved'``, + ``'success'``, ``'error'``, ``'ignore'``, ``'total'``. """ if not self.is_declared: return { diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index f9c925440..513225879 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -1,3 +1,12 @@ +""" +Schema management for DataJoint. + +This module provides the Schema class for binding Python table classes to +database schemas, and utilities for schema introspection and management. +""" + +from __future__ import annotations + import collections import inspect import itertools @@ -5,9 +14,14 @@ import re import types import warnings +from typing import TYPE_CHECKING, Any from .connection import conn from .errors import AccessError, DataJointError + +if TYPE_CHECKING: + from .connection import Connection + from .table import Table from .heading import Heading from .jobs import Job from .settings import config @@ -18,13 +32,22 @@ logger = logging.getLogger(__name__.split(".")[0]) -def ordered_dir(class_): +def ordered_dir(class_: type) -> list[str]: """ - List (most) attributes of the class including inherited ones, similar to `dir` built-in function, - but respects order of attribute declaration as much as possible. + List class attributes respecting declaration order. + + Similar to the ``dir()`` built-in, but preserves attribute declaration + order as much as possible. + + Parameters + ---------- + class_ : type + Class to list members for. - :param class_: class to list members for - :return: a list of attributes declared in class_ and its superclasses + Returns + ------- + list[str] + Attributes declared in class_ and its superclasses. """ attr_list = list() for c in reversed(class_.mro()): @@ -34,34 +57,63 @@ def ordered_dir(class_): class Schema: """ - A schema object is a decorator for UserTable classes that binds them to their database. - It also specifies the namespace `context` in which other UserTable classes are defined. + Decorator that binds table classes to a database schema. + + Schema objects associate Python table classes with database schemas and + provide the namespace context for foreign key resolution. + + Parameters + ---------- + schema_name : str, optional + Database schema name. If omitted, call ``activate()`` later. + context : dict, optional + Namespace for foreign key lookup. None uses caller's context. + connection : Connection, optional + Database connection. Defaults to ``dj.conn()``. + create_schema : bool, optional + If False, raise error if schema doesn't exist. Default True. + create_tables : bool, optional + If False, raise error when accessing missing tables. Default True. + add_objects : dict, optional + Additional objects for the declaration context. + + Examples + -------- + >>> schema = dj.Schema('my_schema') + >>> @schema + ... class Session(dj.Manual): + ... definition = ''' + ... session_id : int + ... ''' """ def __init__( self, - schema_name=None, - context=None, + schema_name: str | None = None, + context: dict[str, Any] | None = None, *, - connection=None, - create_schema=True, - create_tables=True, - add_objects=None, - ): - """ - Associate database schema `schema_name`. If the schema does not exist, attempt to - create it on the server. - - If the schema_name is omitted, then schema.activate(..) must be called later - to associate with the database. 
- - :param schema_name: the database schema to associate. - :param context: dictionary for looking up foreign key references, leave None to use local context. - :param connection: Connection object. Defaults to datajoint.conn(). - :param create_schema: When False, do not create the schema and raise an error if missing. - :param create_tables: When False, do not create tables and raise errors when accessing missing tables. - :param add_objects: a mapping with additional objects to make available to the context in which table classes - are declared. + connection: Connection | None = None, + create_schema: bool = True, + create_tables: bool = True, + add_objects: dict[str, Any] | None = None, + ) -> None: + """ + Initialize the schema object. + + Parameters + ---------- + schema_name : str, optional + Database schema name. If omitted, call ``activate()`` later. + context : dict, optional + Namespace for foreign key lookup. None uses caller's context. + connection : Connection, optional + Database connection. Defaults to ``dj.conn()``. + create_schema : bool, optional + If False, raise error if schema doesn't exist. Default True. + create_tables : bool, optional + If False, raise error when accessing missing tables. Default True. + add_objects : dict, optional + Additional objects for the declaration context. """ self.connection = connection self.database = None @@ -73,30 +125,42 @@ def __init__( if schema_name: self.activate(schema_name) - def is_activated(self): + def is_activated(self) -> bool: + """Check if the schema has been activated.""" return self.database is not None def activate( self, - schema_name=None, + schema_name: str | None = None, *, - connection=None, - create_schema=None, - create_tables=None, - add_objects=None, - ): - """ - Associate database schema `schema_name`. If the schema does not exist, attempt to - create it on the server. - - :param schema_name: the database schema to associate. - schema_name=None is used to assert that the schema has already been activated. - :param connection: Connection object. Defaults to datajoint.conn(). - :param create_schema: If False, do not create the schema and raise an error if missing. - :param create_tables: If False, do not create tables and raise errors when attempting - to access missing tables. - :param add_objects: a mapping with additional objects to make available to the context - in which table classes are declared. + connection: Connection | None = None, + create_schema: bool | None = None, + create_tables: bool | None = None, + add_objects: dict[str, Any] | None = None, + ) -> None: + """ + Associate with a database schema. + + If the schema does not exist, attempts to create it on the server. + + Parameters + ---------- + schema_name : str, optional + Database schema name. None asserts schema is already activated. + connection : Connection, optional + Database connection. Defaults to ``dj.conn()``. + create_schema : bool, optional + If False, raise error if schema doesn't exist. + create_tables : bool, optional + If False, raise error when accessing missing tables. + add_objects : dict, optional + Additional objects for the declaration context. + + Raises + ------ + DataJointError + If schema_name is None and schema not yet activated, or if + schema already activated for a different database. 
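+
+        Examples
+        --------
+        A minimal sketch; the schema name is illustrative:
+
+        >>> schema = dj.Schema()            # deferred association
+        >>> schema.activate('my_schema')    # bind to the database later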
""" if schema_name is None: if self.exists: @@ -144,12 +208,26 @@ def _assert_exists(self, message=None): if not self.exists: raise DataJointError(message or "Schema `{db}` has not been created.".format(db=self.database)) - def __call__(self, cls, *, context=None): + def __call__(self, cls: type, *, context: dict[str, Any] | None = None) -> type: """ - Binds the supplied class to a schema. This is intended to be used as a decorator. + Bind a table class to this schema. Used as a decorator. - :param cls: class to decorate. - :param context: supplied when called from spawn_missing_classes + Parameters + ---------- + cls : type + Table class to decorate. + context : dict, optional + Declaration context. Supplied by spawn_missing_classes. + + Returns + ------- + type + The decorated class. + + Raises + ------ + DataJointError + If applied to a Part table (use on master only). """ context = context or self.context or inspect.currentframe().f_back.f_locals if issubclass(cls, Part): @@ -160,11 +238,16 @@ def __call__(self, cls, *, context=None): self.declare_list.append((cls, context)) return cls - def _decorate_master(self, cls, context): + def _decorate_master(self, cls: type, context: dict[str, Any]) -> None: """ + Process a master table class and its part tables. - :param cls: the master class to process - :param context: the class' declaration context + Parameters + ---------- + cls : type + Master table class to process. + context : dict + Declaration context for foreign key resolution. """ self._decorate_table(cls, context=dict(context, self=cls, **{cls.__name__: cls})) # Process part tables @@ -179,9 +262,18 @@ def _decorate_master(self, cls, context): context=dict(context, master=cls, self=part, **{cls.__name__: cls}), ) - def _decorate_table(self, table_class, context, assert_declared=False): + def _decorate_table(self, table_class: type, context: dict[str, Any], assert_declared: bool = False) -> None: """ - assign schema properties to the table class and declare the table + Assign schema properties to the table class and declare the table. + + Parameters + ---------- + table_class : type + Table class to decorate. + context : dict + Declaration context for foreign key resolution. + assert_declared : bool, optional + If True, assert table is already declared. Default False. """ table_class.database = self.database table_class._connection = self.connection @@ -225,9 +317,14 @@ def __repr__(self): return "Schema `{name}`\n".format(name=self.database) @property - def size_on_disk(self): + def size_on_disk(self) -> int: """ - :return: size of the entire schema in bytes + Return the total size of all tables in the schema. + + Returns + ------- + int + Size in bytes (data + indices). """ self._assert_exists() return int( @@ -239,12 +336,19 @@ def size_on_disk(self): ).fetchone()[0] ) - def spawn_missing_classes(self, context=None): + def spawn_missing_classes(self, context: dict[str, Any] | None = None) -> None: """ - Creates the appropriate python user table classes from tables in the schema and places them - in the context. + Create Python table classes for tables without existing classes. + + Introspects the database schema and creates appropriate Python classes + (Lookup, Manual, Imported, Computed, Part) for tables that don't have + corresponding classes in the context. - :param context: alternative context to place the missing classes into, e.g. locals() + Parameters + ---------- + context : dict, optional + Namespace to place created classes into. 
Defaults to caller's + local namespace. """ self._assert_exists() if context is None: @@ -287,9 +391,19 @@ def spawn_missing_classes(self, context=None): self._decorate_table(part_class, context=context, assert_declared=True) setattr(master_class, class_name, part_class) - def drop(self, force=False): + def drop(self, force: bool = False) -> None: """ - Drop the associated schema if it exists + Drop the associated schema and all its tables. + + Parameters + ---------- + force : bool, optional + If True, skip confirmation prompt. Default False. + + Raises + ------ + AccessError + If insufficient permissions to drop the schema. """ if not self.exists: logger.info("Schema named `{database}` does not exist. Doing nothing.".format(database=self.database)) @@ -308,9 +422,19 @@ def drop(self, force=False): ) @property - def exists(self): + def exists(self) -> bool: """ - :return: true if the associated schema exists on the server + Check if the associated schema exists on the server. + + Returns + ------- + bool + True if the schema exists. + + Raises + ------ + DataJointError + If schema has not been activated. """ if self.database is None: raise DataJointError("Schema must be activated first.") @@ -323,9 +447,14 @@ def exists(self): ) @property - def lineage_table_exists(self): + def lineage_table_exists(self) -> bool: """ - :return: true if the ~lineage table exists in this schema + Check if the ~lineage table exists in this schema. + + Returns + ------- + bool + True if the lineage table exists. """ from .lineage import lineage_table_exists @@ -333,29 +462,34 @@ def lineage_table_exists(self): return lineage_table_exists(self.connection, self.database) @property - def lineage(self): + def lineage(self) -> dict[str, str]: """ Get all lineages for tables in this schema. - :return: A dict mapping 'schema.table.attribute' to its lineage + Returns + ------- + dict[str, str] + Mapping of ``'schema.table.attribute'`` to its lineage origin. """ from .lineage import get_schema_lineages self._assert_exists() return get_schema_lineages(self.connection, self.database) - def rebuild_lineage(self): + def rebuild_lineage(self) -> None: """ Rebuild the ~lineage table for all tables in this schema. - This recomputes lineage for all attributes by querying FK relationships - from the information_schema. Use this to restore lineage for schemas - that predate the lineage system or after corruption. + Recomputes lineage for all attributes by querying FK relationships + from the information_schema. Use to restore lineage for schemas that + predate the lineage system or after corruption. + Notes + ----- After rebuilding, restart the Python kernel and reimport to pick up the new lineage information. - Note: Upstream schemas (referenced via cross-schema foreign keys) must + Upstream schemas (referenced via cross-schema foreign keys) must have their lineage rebuilt first. """ from .lineage import rebuild_schema_lineage @@ -364,15 +498,19 @@ def rebuild_lineage(self): rebuild_schema_lineage(self.connection, self.database) @property - def jobs(self): + def jobs(self) -> list[Job]: """ - Return list of Job objects for auto-populated tables that have job tables. + Return Job objects for auto-populated tables with job tables. - Only returns Job objects when both the target table and its ~~table_name - job table exist in the database. Job tables are created lazily on first - access to table.jobs or populate(reserve_jobs=True). 
+ Only returns Job objects when both the target table and its + ``~~table_name`` job table exist in the database. Job tables are + created lazily on first access to ``table.jobs`` or + ``populate(reserve_jobs=True)``. - :return: list of Job objects for existing job tables + Returns + ------- + list[Job] + Job objects for existing job tables. """ self._assert_exists() jobs_list = [] @@ -400,12 +538,24 @@ def code(self): self._assert_exists() return self.save() - def save(self, python_filename=None): + def save(self, python_filename: str | None = None) -> str: """ - Generate the code for a module that recreates the schema. - This method is in preparation for a future release and is not officially supported. + Generate Python code that recreates this schema. - :return: a string containing the body of a complete Python module defining this schema. + Parameters + ---------- + python_filename : str, optional + If provided, write the code to this file. + + Returns + ------- + str + Python module source code defining this schema. + + Notes + ----- + This method is in preparation for a future release and is not + officially supported. """ self.connection.dependencies.load() self._assert_exists() @@ -460,12 +610,17 @@ def replace(s): with open(python_filename, "wt") as f: f.write(python_code) - def list_tables(self): + def list_tables(self) -> list[str]: """ - Return a list of all tables in the schema except tables with ~ in first character such - as ~logs and ~job + Return all user tables in the schema. + + Excludes hidden tables (starting with ``~``) such as ``~lineage`` + and job tables (``~~``). - :return: A list of table names from the database schema. + Returns + ------- + list[str] + Table names in topological order. """ self.connection.dependencies.load() return [ @@ -477,31 +632,59 @@ def list_tables(self): class VirtualModule(types.ModuleType): """ - A virtual module imitates a Python module representing a DataJoint schema from table definitions in the database. - It declares the schema objects and a class for each table. + A virtual module representing a DataJoint schema from database tables. + + Creates a Python module with table classes automatically generated from + the database schema. Useful for accessing schemas without Python source. + + Parameters + ---------- + module_name : str + Display name for the module. + schema_name : str + Database schema name. + create_schema : bool, optional + If True, create the schema if it doesn't exist. Default False. + create_tables : bool, optional + If True, allow declaring new tables. Default False. + connection : Connection, optional + Database connection. Defaults to ``dj.conn()``. + add_objects : dict, optional + Additional objects to add to the module namespace. + + Examples + -------- + >>> lab = dj.VirtualModule('lab', 'my_lab_schema') + >>> lab.Subject.fetch() """ def __init__( self, - module_name, - schema_name, + module_name: str, + schema_name: str, *, - create_schema=False, - create_tables=False, - connection=None, - add_objects=None, - ): - """ - Creates a python module with the given name from the name of a schema on the server and - automatically adds classes to it corresponding to the tables in the schema. 
- - :param module_name: displayed module name - :param schema_name: name of the database in mysql - :param create_schema: if True, create the schema on the database server - :param create_tables: if True, module.schema can be used as the decorator for declaring new - :param connection: a dj.Connection object to pass into the schema - :param add_objects: additional objects to add to the module - :return: the python module containing classes from the schema object and the table classes + create_schema: bool = False, + create_tables: bool = False, + connection: Connection | None = None, + add_objects: dict[str, Any] | None = None, + ) -> None: + """ + Initialize the virtual module. + + Parameters + ---------- + module_name : str + Display name for the module. + schema_name : str + Database schema name. + create_schema : bool, optional + If True, create the schema if it doesn't exist. Default False. + create_tables : bool, optional + If True, allow declaring new tables. Default False. + connection : Connection, optional + Database connection. Defaults to ``dj.conn()``. + add_objects : dict, optional + Additional objects to add to the module namespace. """ super(VirtualModule, self).__init__(name=module_name) _schema = Schema( @@ -516,10 +699,19 @@ def __init__( _schema.spawn_missing_classes(context=self.__dict__) -def list_schemas(connection=None): +def list_schemas(connection: Connection | None = None) -> list[str]: """ - :param connection: a dj.Connection object - :return: list of all accessible schemas on the server + List all accessible schemas on the server. + + Parameters + ---------- + connection : Connection, optional + Database connection. Defaults to ``dj.conn()``. + + Returns + ------- + list[str] + Names of all accessible schemas. """ return [ r[0] diff --git a/src/datajoint/table.py b/src/datajoint/table.py index ac8e8211f..d5481a8c2 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import collections import csv import inspect @@ -9,11 +11,17 @@ import warnings from dataclasses import dataclass, field from pathlib import Path +from typing import TYPE_CHECKING, Any import numpy as np import pandas from .condition import make_condition + +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping + from .connection import Connection + from .expression import QueryExpression from .declare import alter, declare from .errors import ( AccessError, @@ -109,7 +117,7 @@ class Table(QueryExpression): declaration_context = None @property - def table_name(self): + def table_name(self) -> str: # For UserTable subclasses, table_name is computed by the metaclass. # Delegate to the class's table_name if _table_name is not set. if self._table_name is None: @@ -117,19 +125,22 @@ def table_name(self): return self._table_name @property - def class_name(self): + def class_name(self) -> str: return self.__class__.__name__ @property - def definition(self): + def definition(self) -> str: raise NotImplementedError("Subclasses of Table must implement the `definition` property") - def declare(self, context=None): + def declare(self, context: dict[str, Any] | None = None) -> None: """ Declare the table in the schema based on self.definition. - :param context: the context for foreign key resolution. If None, foreign keys are - not allowed. + Parameters + ---------- + context : dict, optional + Namespace for foreign key resolution. If None, foreign keys are + not allowed in the definition. 
""" if self.connection.in_transaction: raise DataJointError("Cannot declare new tables inside a transaction, e.g. from inside a populate/make call") @@ -154,19 +165,23 @@ def declare(self, context=None): # Populate lineage table for this table's attributes self._populate_lineage(primary_key, fk_attribute_map) - def _declare_check(self, primary_key, fk_attribute_map): + def _declare_check(self, primary_key: list[str], fk_attribute_map: dict[str, tuple[str, str]]) -> None: """ Hook for declaration-time validation. Subclasses can override. Called before the table is created in the database. Override this method to add validation logic (e.g., AutoPopulate validates FK-only primary keys). - :param primary_key: list of primary key attribute names - :param fk_attribute_map: dict mapping child_attr -> (parent_table, parent_attr) + Parameters + ---------- + primary_key : list + List of primary key attribute names. + fk_attribute_map : dict + Mapping of child_attr -> (parent_table, parent_attr). """ pass # Default: no validation - def _populate_lineage(self, primary_key, fk_attribute_map): + def _populate_lineage(self, primary_key: list[str], fk_attribute_map: dict[str, tuple[str, str]]) -> None: """ Populate the ~lineage table with lineage information for this table's attributes. @@ -174,8 +189,12 @@ def _populate_lineage(self, primary_key, fk_attribute_map): - All FK attributes (traced to their origin) - Native primary key attributes (lineage = self) - :param primary_key: list of primary key attribute names - :param fk_attribute_map: dict mapping child_attr -> (parent_table, parent_attr) + Parameters + ---------- + primary_key : list + List of primary key attribute names. + fk_attribute_map : dict + Mapping of child_attr -> (parent_table, parent_attr). """ from .lineage import ( ensure_lineage_table, @@ -257,28 +276,49 @@ def alter(self, prompt=True, context=None): if prompt: logger.info("Table altered") - def from_clause(self): + def from_clause(self) -> str: """ - :return: the FROM clause of SQL SELECT statements. + Return the FROM clause for SQL SELECT statements. + + Returns + ------- + str + The full table name in backtick-quoted format. """ return self.full_table_name - def get_select_fields(self, select_fields=None): + def get_select_fields(self, select_fields: list[str] | None = None) -> str: """ - :return: the selected attributes from the SQL SELECT statement. + Return the selected attributes for SQL SELECT statements. + + Returns + ------- + str + SQL field list or "*" if no projection is specified. """ return "*" if select_fields is None else self.heading.project(select_fields).as_sql - def parents(self, primary=None, as_objects=False, foreign_key_info=False): + def parents(self, primary: bool | None = None, as_objects: bool = False, foreign_key_info: bool = False) -> list: """ - - :param primary: if None, then all parents are returned. If True, then only foreign keys composed of - primary key attributes are considered. If False, return foreign keys including at least one - secondary attribute. - :param as_objects: if False, return table names. If True, return table objects. - :param foreign_key_info: if True, each element in result also includes foreign key info. - :return: list of parents as table names or table objects - with (optional) foreign key information. + Return the list of parent tables referenced by foreign keys. + + Parameters + ---------- + primary : bool, optional + If None, return all parents. 
If True, only return foreign keys + composed entirely of primary key attributes. If False, return + foreign keys including at least one secondary attribute. + as_objects : bool, optional + If False (default), return table names as strings. + If True, return FreeTable objects. + foreign_key_info : bool, optional + If True, return tuples of (table, fk_properties). + If False (default), return only table names/objects. + + Returns + ------- + list + Parent tables as names, objects, or (table, fk_info) tuples. """ get_edge = self.connection.dependencies.parents nodes = [ @@ -291,15 +331,27 @@ def parents(self, primary=None, as_objects=False, foreign_key_info=False): nodes = [name for name, props in nodes] return nodes - def children(self, primary=None, as_objects=False, foreign_key_info=False): + def children(self, primary: bool | None = None, as_objects: bool = False, foreign_key_info: bool = False) -> list: """ - :param primary: if None, then all children are returned. If True, then only foreign keys composed of - primary key attributes are considered. If False, return foreign keys including at least one - secondary attribute. - :param as_objects: if False, return table names. If True, return table objects. - :param foreign_key_info: if True, each element in result also includes foreign key info. - :return: list of children as table names or table objects - with (optional) foreign key information. + Return the list of child tables that reference this table via foreign keys. + + Parameters + ---------- + primary : bool, optional + If None, return all children. If True, only return foreign keys + composed entirely of primary key attributes. If False, return + foreign keys including at least one secondary attribute. + as_objects : bool, optional + If False (default), return table names as strings. + If True, return FreeTable objects. + foreign_key_info : bool, optional + If True, return tuples of (table, fk_properties). + If False (default), return only table names/objects. + + Returns + ------- + list + Child tables as names, objects, or (table, fk_info) tuples. """ get_edge = self.connection.dependencies.children nodes = [ @@ -312,10 +364,23 @@ def children(self, primary=None, as_objects=False, foreign_key_info=False): nodes = [name for name, props in nodes] return nodes - def descendants(self, as_objects=False): + def descendants(self, as_objects: bool = False) -> list: """ - :param as_objects: False - a list of table names; True - a list of table objects. - :return: list of tables descendants in topological order. + Return all descendant tables in topological order. + + Descendants are tables that directly or indirectly depend on this table + through foreign key relationships. + + Parameters + ---------- + as_objects : bool, optional + If False (default), return table names as strings. + If True, return FreeTable objects. + + Returns + ------- + list + Descendant tables in topological order (dependencies first). """ return [ FreeTable(self.connection, node) if as_objects else node @@ -323,10 +388,23 @@ def descendants(self, as_objects=False): if not node.isdigit() ] - def ancestors(self, as_objects=False): + def ancestors(self, as_objects: bool = False) -> list: """ - :param as_objects: False - a list of table names; True - a list of table objects. - :return: list of tables ancestors in topological order. + Return all ancestor tables in topological order. + + Ancestors are tables that this table directly or indirectly depends on + through foreign key relationships. 
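+        For example, if ``C`` has a foreign key to ``B`` and ``B`` has a
+        foreign key to ``A``, then ``C().ancestors()`` includes both ``A``
+        and ``B``.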
+ + Parameters + ---------- + as_objects : bool, optional + If False (default), return table names as strings. + If True, return FreeTable objects. + + Returns + ------- + list + Ancestor tables in topological order (dependencies first). """ return [ FreeTable(self.connection, node) if as_objects else node @@ -334,11 +412,23 @@ def ancestors(self, as_objects=False): if not node.isdigit() ] - def parts(self, as_objects=False): + def parts(self, as_objects: bool = False) -> list: """ - return part tables either as entries in a dict with foreign key information or a list of objects + Return part tables belonging to this master table. + + Part tables are subordinate tables whose names follow the pattern + ``master__part`` (double underscore). + + Parameters + ---------- + as_objects : bool, optional + If False (default), return table names as strings. + If True, return FreeTable objects. - :param as_objects: if False (default), the output is a dict describing the foreign keys. If True, return table objects. + Returns + ------- + list + Part table names or FreeTable objects. """ self.connection.dependencies.load(force=False) nodes = [ @@ -349,9 +439,14 @@ def parts(self, as_objects=False): return [FreeTable(self.connection, c) for c in nodes] if as_objects else nodes @property - def is_declared(self): + def is_declared(self) -> bool: """ - :return: True is the table is declared in the schema. + Check if the table is declared in the database. + + Returns + ------- + bool + True if the table exists in the schema. """ return ( self.connection.query( @@ -361,9 +456,14 @@ def is_declared(self): ) @property - def full_table_name(self): + def full_table_name(self) -> str: """ - :return: full table name in the schema + Return the fully qualified table name. + + Returns + ------- + str + Table name in the format ```database`.`table_name```. """ if self.database is None or self.table_name is None: raise DataJointError( @@ -372,23 +472,31 @@ def full_table_name(self): ) return r"`{0:s}`.`{1:s}`".format(self.database, self.table_name) - def update1(self, row): + def update1(self, row: Mapping[str, Any]) -> None: """ - ``update1`` updates one existing entry in the table. - Caution: In DataJoint the primary modes for data manipulation is to ``insert`` and - ``delete`` entire records since referential integrity works on the level of records, - not fields. Therefore, updates are reserved for corrective operations outside of main - workflow. Use UPDATE methods sparingly with full awareness of potential violations of - assumptions. - - :param row: a ``dict`` containing the primary key values and the attributes to update. - Setting an attribute value to None will reset it to the default value (if any). - - The primary key attributes must always be provided. - - Examples: - - >>> table.update1({'id': 1, 'value': 3}) # update value in record with id=1 + Update one existing row in the table. + + Caution: In DataJoint, the primary modes for data manipulation are + ``insert`` and ``delete`` of entire rows since referential integrity + operates at the row level, not field level. Use updates sparingly + for corrective operations outside the main workflow. + + Parameters + ---------- + row : dict + Dictionary containing primary key values and attributes to update. + All primary key attributes must be provided. Setting an attribute + to None resets it to its default value (if any). 
+ + Raises + ------ + DataJointError + If row is not dict-like, primary key is incomplete, attribute + not found, table is restricted, or row doesn't exist. + + Examples + -------- + >>> table.update1({'id': 1, 'value': 3}) # update value in row with id=1 >>> table.update1({'id': 1, 'value': None}) # reset value to default """ # argument validations @@ -414,15 +522,26 @@ def update1(self, row): ) self.connection.query(query, args=list(r[2] for r in row if r[2] is not None)) - def validate(self, rows, *, ignore_extra_fields=False) -> ValidationResult: + def validate(self, rows: Iterable | pandas.DataFrame, *, ignore_extra_fields: bool = False) -> ValidationResult: """ Validate rows without inserting them. - :param rows: Same format as insert() - iterable of dicts, tuples, numpy records, + Parameters + ---------- + rows : iterable or DataFrame + Same format as ``insert()``: iterable of dicts, tuples, numpy records, or a pandas DataFrame. - :param ignore_extra_fields: If True, ignore fields not in the table heading. - :return: ValidationResult with is_valid, errors list, and rows_checked count. + ignore_extra_fields : bool, optional + If True, ignore fields not in the table heading. Default False. + + Returns + ------- + ValidationResult + Result object with ``is_valid``, ``errors`` list, and ``rows_checked``. + Can be used in boolean context (``if result: ...``). + Notes + ----- Validates: - Field existence (all fields must be in table heading) - Row format (correct number of attributes for positional inserts) @@ -436,13 +555,13 @@ def validate(self, rows, *, ignore_extra_fields=False) -> ValidationResult: - Unique constraints (other than PK) - Custom MySQL constraints - Example:: - - result = table.validate(rows) - if result: - table.insert(rows) - else: - print(result.summary()) + Examples + -------- + >>> result = table.validate(rows) + >>> if result: + ... table.insert(rows) + ... else: + ... print(result.summary()) """ errors = [] @@ -551,12 +670,21 @@ def validate(self, rows, *, ignore_extra_fields=False) -> ValidationResult: return ValidationResult(is_valid=len(errors) == 0, errors=errors, rows_checked=row_count) - def insert1(self, row, **kwargs): + def insert1(self, row: Mapping[str, Any] | tuple | np.void, **kwargs: Any) -> None: """ - Insert one data record into the table. For ``kwargs``, see ``insert()``. - - :param row: a numpy record, a dict-like object, or an ordered sequence to be inserted - as one row. + Insert a single row into the table. + + Parameters + ---------- + row : dict or numpy.void or sequence + A single row to insert: dict-like, numpy record, or ordered sequence. + **kwargs + Passed to ``insert()``: replace, skip_duplicates, ignore_extra_fields, + allow_direct_insert. + + See Also + -------- + insert : Insert multiple rows. """ self.insert((row,), **kwargs) @@ -565,27 +693,31 @@ def staged_insert1(self): """ Context manager for staged insert with direct object storage writes. - Use this for large objects like Zarr arrays where copying from local storage - is inefficient. Allows writing directly to the destination storage before - finalizing the database insert. 
- - Example: - with table.staged_insert1 as staged: - staged.rec['subject_id'] = 123 - staged.rec['session_id'] = 45 - - # Create object storage directly - z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', shape=(1000, 1000)) - z[:] = data - - # Assign to record - staged.rec['raw_data'] = z - - # On successful exit: metadata computed, record inserted - # On exception: storage cleaned up, no record inserted - - Yields: - StagedInsert: Context for setting record values and getting storage handles + Use this for large objects like Zarr arrays where copying from local + storage is inefficient. Allows writing directly to the destination + storage before finalizing the database insert. + + Yields + ------ + StagedInsert + Context for setting record values and getting storage handles. + + Examples + -------- + >>> with table.staged_insert1 as staged: + ... staged.rec['subject_id'] = 123 + ... staged.rec['session_id'] = 45 + ... + ... # Create object storage directly + ... z = zarr.open(staged.store('raw_data', '.zarr'), mode='w', + ... shape=(1000, 1000)) + ... z[:] = data + ... + ... # Assign to record + ... staged.rec['raw_data'] = z + ... + ... # On successful exit: metadata computed, record inserted + ... # On exception: storage cleaned up, no record inserted """ return _staged_insert1(self) @@ -599,29 +731,50 @@ def insert( chunk_size=None, ): """ - Insert a collection of rows. - - :param rows: Either (a) an iterable where an element is a numpy record, a - dict-like object, a pandas.DataFrame, a sequence, or a query expression with - the same heading as self, or (b) a pathlib.Path object specifying a path - relative to the current directory with a CSV file, the contents of which - will be inserted. - :param replace: If True, replaces the existing tuple. - :param skip_duplicates: If True, silently skip duplicate inserts. - :param ignore_extra_fields: If False, fields that are not in the heading raise error. - :param allow_direct_insert: Only applies in auto-populated tables. If False (default), - insert may only be called from inside the make callback. - :param chunk_size: If set, insert rows in batches of this size. Useful for very - large inserts to avoid memory issues. Each chunk is a separate transaction. - - Example: - - >>> Table.insert([ - >>> dict(subject_id=7, species="mouse", date_of_birth="2014-09-01"), - >>> dict(subject_id=8, species="mouse", date_of_birth="2014-09-02")]) - - # Large insert with chunking - >>> Table.insert(large_dataset, chunk_size=10000) + Insert one or more rows into the table. + + Parameters + ---------- + rows : iterable or Path or QueryExpression + Data to insert. Can be: + + - iterable of dicts: ``[{"attr": value, ...}, ...]`` + - iterable of numpy.void: Records from a structured array + - pandas.DataFrame: Each row becomes a table row + - QueryExpression: Results of a query (insert from select) + - pathlib.Path: Path to a CSV file + + replace : bool, optional + If True, replace existing rows with matching primary keys using + MySQL's REPLACE statement. Default False. + skip_duplicates : bool, optional + If True, silently skip rows that would cause duplicate key errors. + Default False. + ignore_extra_fields : bool, optional + If True, ignore fields in the input that are not in the table + heading. If False (default), extra fields raise an error. + allow_direct_insert : bool, optional + For auto-populated tables: if False (default), inserts are only + allowed from within the ``make()`` method. Set True to bypass. 
+ chunk_size : int, optional + If set, insert rows in batches of this size. Useful for very + large inserts to avoid memory issues. Each chunk is a separate + transaction. + + Examples + -------- + >>> Table.insert([ + ... {'subject_id': 7, 'species': 'mouse', 'dob': '2014-09-01'}, + ... {'subject_id': 8, 'species': 'mouse', 'dob': '2014-09-02'} + ... ]) + + >>> # Large insert with chunking + >>> Table.insert(large_dataset, chunk_size=10000) + + See Also + -------- + insert1 : Insert a single row. + insert_dataframe : Insert DataFrame with index handling. """ if isinstance(rows, pandas.DataFrame): # drop 'extra' synthetic index for 1-field index case - @@ -686,10 +839,16 @@ def _insert_rows(self, rows, replace, skip_duplicates, ignore_extra_fields): """ Internal helper to insert a batch of rows. - :param rows: Iterable of rows to insert - :param replace: If True, use REPLACE instead of INSERT - :param skip_duplicates: If True, use ON DUPLICATE KEY UPDATE - :param ignore_extra_fields: If True, ignore unknown fields + Parameters + ---------- + rows : iterable + Rows to insert. + replace : bool + If True, use REPLACE instead of INSERT. + skip_duplicates : bool + If True, use ON DUPLICATE KEY UPDATE. + ignore_extra_fields : bool + If True, ignore unknown fields. """ # collects the field list from first row (passed by reference) field_list = [] @@ -718,30 +877,42 @@ def insert_dataframe(self, df, index_as_pk=None, **insert_kwargs): """ Insert DataFrame with explicit index handling. - This method provides symmetry with to_pandas(): data fetched with to_pandas() - (which sets primary key as index) can be modified and re-inserted using - insert_dataframe() without manual index manipulation. + Provides symmetry with ``to_pandas()``: data fetched with ``to_pandas()`` + (which sets primary key as index) can be modified and re-inserted + without manual index manipulation. - :param df: pandas DataFrame to insert - :param index_as_pk: How to handle DataFrame index: - - None (default): Auto-detect. Use index as primary key if index names - match primary_key columns. Drop if unnamed RangeIndex. - - True: Treat index as primary key columns. Raises if index names don't + Parameters + ---------- + df : pandas.DataFrame + DataFrame to insert. + index_as_pk : bool, optional + How to handle DataFrame index: + + - None (default): Auto-detect. Use index as primary key if index + names match primary_key columns. Drop if unnamed RangeIndex. + - True: Treat index as primary key columns. Raises if names don't match table primary key. - False: Ignore index entirely (drop it). - :param **insert_kwargs: Passed to insert() - replace, skip_duplicates, - ignore_extra_fields, allow_direct_insert, chunk_size - - Example:: - # Round-trip with to_pandas() - df = table.to_pandas() # PK becomes index - df['value'] = df['value'] * 2 # Modify data - table.insert_dataframe(df) # Auto-detects index as PK - - # Explicit control - table.insert_dataframe(df, index_as_pk=True) # Use index - table.insert_dataframe(df, index_as_pk=False) # Ignore index + **insert_kwargs + Passed to ``insert()``: replace, skip_duplicates, + ignore_extra_fields, allow_direct_insert, chunk_size. 
+ + Examples + -------- + >>> # Round-trip with to_pandas() + >>> df = table.to_pandas() # PK becomes index + >>> df['value'] = df['value'] * 2 # Modify data + >>> table.insert_dataframe(df) # Auto-detects index as PK + + >>> # Explicit control + >>> table.insert_dataframe(df, index_as_pk=True) # Use index + >>> table.insert_dataframe(df, index_as_pk=False) # Ignore index + + See Also + -------- + insert : General insert method. + to_pandas : Fetch data as DataFrame with PK as index. """ if not isinstance(df, pandas.DataFrame): raise DataJointError("insert_dataframe requires a pandas DataFrame") @@ -789,10 +960,22 @@ def _validate_index_columns(self, df): f"Use index_as_pk=False to ignore index, or reset_index() first." ) - def delete_quick(self, get_count=False): + def delete_quick(self, get_count: bool = False) -> int | None: """ - Deletes the table without cascading and without user prompt. - If this table has populated dependent tables, this will fail. + Delete rows without cascading or user prompt. + + If this table has populated dependent tables, the delete will fail + due to foreign key constraints. + + Parameters + ---------- + get_count : bool, optional + If True, return the number of deleted rows. Default False. + + Returns + ------- + int or None + Number of deleted rows if get_count=True, else None. """ query = "DELETE FROM " + self.full_table_name + self.where_clause() self.connection.query(query) @@ -807,25 +990,35 @@ def delete( force_masters: bool = False, ) -> int: """ - Deletes the contents of the table and its dependent tables, recursively. - - Args: - transaction: If `True`, use of the entire delete becomes an atomic transaction. - This is the default and recommended behavior. Set to `False` if this delete is - nested within another transaction. - safemode: If `True`, prohibit nested transactions and prompt to confirm. Default - is `dj.config['safemode']`. - force_parts: Delete from parts even when not deleting from their masters. - force_masters: If `True`, include part/master pairs in the cascade. - Default is `False`. - - Returns: - Number of deleted rows (excluding those from dependent tables). - - Raises: - DataJointError: Delete exceeds maximum number of delete attempts. - DataJointError: When deleting within an existing transaction. - DataJointError: Deleting a part table before its master. + Delete rows and cascade to dependent tables. + + Recursively deletes matching rows from this table and all tables + that reference it through foreign keys. + + Parameters + ---------- + transaction : bool, optional + If True (default), the entire delete is an atomic transaction. + Set False if nested within another transaction. + safemode : bool, optional + If True, prohibit nested transactions and prompt for confirmation. + Default is ``dj.config['safemode']``. + force_parts : bool, optional + Delete from part tables even when not deleting from their masters. + Default False. + force_masters : bool, optional + Include part/master pairs in the cascade. Default False. + + Returns + ------- + int + Number of deleted rows (excluding dependent tables). + + Raises + ------ + DataJointError + If delete exceeds maximum attempts, if deleting within an existing + transaction (in safemode), or if deleting part before master. 
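+
+        Examples
+        --------
+        An illustrative sketch; ``Session`` stands in for any populated table:
+
+        >>> (Session & 'subject_id = 7').delete()                 # cascade, with confirmation prompt
+        >>> (Session & 'subject_id = 7').delete(safemode=False)   # skip the confirmation prompt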
""" deleted = set() visited_masters = set() @@ -965,9 +1158,12 @@ def cascade(table): logger.warning("Delete cancelled") return delete_count - def drop_quick(self): + def drop_quick(self) -> None: """ - Drops the table without cascading to dependent tables and without user prompt. + Drop the table without cascading or user prompt. + + Unlike ``drop()``, this does not cascade to dependent tables and will + fail if other tables have foreign key references to this table. """ if self.is_declared: # Clean up lineage entries for this table @@ -981,10 +1177,18 @@ def drop_quick(self): else: logger.info("Nothing to drop: table %s is not declared" % self.full_table_name) - def drop(self): + def drop(self) -> None: """ - Drop the table and all tables that reference it, recursively. - User is prompted for confirmation if config['safemode'] is set to True. + Drop the table and all dependent tables recursively. + + Cascades to all tables that reference this table through foreign keys. + User is prompted for confirmation if ``config['safemode']`` is True. + + Raises + ------ + DataJointError + If called on a restricted table or attempting to drop a part + table before its master. """ if self.restriction: raise DataJointError( @@ -1014,9 +1218,14 @@ def drop(self): logger.info("Tables dropped. Restart kernel.") @property - def size_on_disk(self): + def size_on_disk(self) -> int: """ - :return: size of data and indices in bytes on the storage device + Return the size of data and indices on disk. + + Returns + ------- + int + Size in bytes (data + indices). """ ret = self.connection.query( 'SHOW TABLE STATUS FROM `{database}` WHERE NAME="{table}"'.format(database=self.database, table=self.table_name), @@ -1024,9 +1233,22 @@ def size_on_disk(self): ).fetchone() return ret["Data_length"] + ret["Index_length"] - def describe(self, context=None, printout=False): + def describe(self, context: dict[str, Any] | None = None, printout: bool = False) -> str: """ - :return: the definition string for the query using DataJoint DDL. + Return the table definition in DataJoint DDL syntax. + + Parameters + ---------- + context : dict, optional + Namespace for resolving foreign key class names. If None, uses + the caller's namespace. + printout : bool, optional + If True, also log the definition. Default False. + + Returns + ------- + str + Table definition in DataJoint DDL format. """ if context is None: frame = inspect.currentframe().f_back @@ -1242,12 +1464,21 @@ def check_fields(fields): def lookup_class_name(name, context, depth=3): """ - given a table name in the form `schema_name`.`table_name`, find its class in the context. - - :param name: `schema_name`.`table_name` - :param context: dictionary representing the namespace - :param depth: search depth into imported modules, helps avoid infinite recursion. - :return: class name found in the context or None if not found + Find a table class in the given namespace by its full table name. + + Parameters + ---------- + name : str + Full table name in format ```schema_name`.`table_name```. + context : dict + Namespace to search (e.g., globals()). + depth : int, optional + Search depth into imported modules. Default 3. + + Returns + ------- + str or None + Class name if found (e.g., "module.ClassName"), None otherwise. """ # breadth-first search nodes = [dict(context=context, context_name="", depth=depth)] @@ -1283,11 +1514,17 @@ def lookup_class_name(name, context, depth=3): class FreeTable(Table): """ - A base table without a dedicated class. 
Each instance is associated with a table - specified by full_table_name. + A table without a dedicated Python class. + + FreeTable provides access to any table by its full name, useful for + introspection and cascading operations. - :param conn: a dj.Connection object - :param full_table_name: in format `database`.`table_name` + Parameters + ---------- + conn : dj.Connection + Database connection. + full_table_name : str + Table name in format ```database`.`table_name```. """ def __init__(self, conn, full_table_name): diff --git a/src/datajoint/types.py b/src/datajoint/types.py new file mode 100644 index 000000000..72cefee3c --- /dev/null +++ b/src/datajoint/types.py @@ -0,0 +1,60 @@ +""" +Type definitions for DataJoint. + +This module defines type aliases used throughout the DataJoint codebase +to improve code clarity and enable better static type checking. + +Python 3.10+ is required. +""" + +from __future__ import annotations + +from typing import Any, TypeAlias + +# Primary key types +PrimaryKey: TypeAlias = dict[str, Any] +"""A dictionary mapping attribute names to values that uniquely identify an entity.""" + +PrimaryKeyList: TypeAlias = list[dict[str, Any]] +"""A list of primary key dictionaries.""" + +# Row/record types +Row: TypeAlias = dict[str, Any] +"""A single row/record as a dictionary mapping attribute names to values.""" + +RowList: TypeAlias = list[dict[str, Any]] +"""A list of rows/records.""" + +# Attribute types +AttributeName: TypeAlias = str +"""Name of a table attribute/column.""" + +AttributeNames: TypeAlias = list[str] +"""List of attribute/column names.""" + +# Table and schema names +TableName: TypeAlias = str +"""Simple table name (e.g., 'session').""" + +FullTableName: TypeAlias = str +"""Fully qualified table name (e.g., '`schema`.`table`').""" + +SchemaName: TypeAlias = str +"""Database schema name.""" + +# Foreign key mapping +ForeignKeyMap: TypeAlias = dict[str, tuple[str, str]] +"""Mapping of child_attr -> (parent_table, parent_attr) for foreign keys.""" + +# Restriction types +Restriction: TypeAlias = str | dict[str, Any] | bool | "QueryExpression" | list | None +"""Valid restriction types for query operations.""" + +# Fetch result types +FetchResult: TypeAlias = list[dict[str, Any]] +"""Result of a fetch operation as list of dictionaries.""" + + +# For avoiding circular imports +if False: # TYPE_CHECKING equivalent that's always False + from .expression import QueryExpression From 96e2edf166e2e70638c1591c2d3e5f0cce496e87 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 12:19:44 -0600 Subject: [PATCH 02/15] docs: Continue docstring harmonization to NumPy style (batch 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harmonized modules: - declare.py - Table definition parsing - blob.py - Binary serialization - storage.py - fsspec storage backend - codecs.py - Codec type system - dependencies.py - FK dependency graph - diagram.py - Schema diagram visualization Also includes formatting fixes from pre-commit hooks for expression.py and other modules with long function signatures. All modules now use: - NumPy-style docstrings (Parameters, Returns, Raises, etc.) 
- `from __future__ import annotations` for deferred evaluation - Python 3.10+ type hints (X | None, list[str]) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/blob.py | 134 ++++++++++-- src/datajoint/codecs.py | 274 ++++++++++++++---------- src/datajoint/condition.py | 2 +- src/datajoint/connection.py | 2 +- src/datajoint/declare.py | 312 ++++++++++++++++++++++----- src/datajoint/dependencies.py | 176 ++++++++++++---- src/datajoint/diagram.py | 206 ++++++++++++------ src/datajoint/expression.py | 59 +++++- src/datajoint/heading.py | 2 +- src/datajoint/schemas.py | 1 - src/datajoint/storage.py | 386 +++++++++++++++++++++++----------- src/datajoint/table.py | 1 - 12 files changed, 1138 insertions(+), 417 deletions(-) diff --git a/src/datajoint/blob.py b/src/datajoint/blob.py index 2ac0e62cd..8651a57af 100644 --- a/src/datajoint/blob.py +++ b/src/datajoint/blob.py @@ -1,8 +1,13 @@ """ -(De)serialization methods for basic datatypes and numpy.ndarrays with provisions for mutual -compatibility with Matlab-based serialization implemented by mYm. +Binary serialization for DataJoint blob storage. + +Provides (de)serialization for Python/NumPy objects with backward compatibility +for MATLAB mYm-format blobs. Supports arrays, scalars, structs, cells, and +Python built-in types (dict, list, tuple, set, datetime, UUID, Decimal). """ +from __future__ import annotations + import collections import datetime import uuid @@ -69,31 +74,74 @@ def len_u32(obj): class MatCell(np.ndarray): - """a numpy ndarray representing a Matlab cell array""" + """ + NumPy ndarray subclass representing a MATLAB cell array. + + Used to distinguish cell arrays from regular arrays during serialization + for MATLAB compatibility. + """ pass class MatStruct(np.recarray): - """numpy.recarray representing a Matlab struct array""" + """ + NumPy recarray subclass representing a MATLAB struct array. + + Used to distinguish struct arrays from regular recarrays during + serialization for MATLAB compatibility. + """ pass class Blob: - def __init__(self, squeeze=False): + """ + Binary serializer/deserializer for DataJoint blob storage. + + Handles packing Python objects into binary format and unpacking binary + data back to Python objects. Supports two protocols: + + - ``mYm``: Original MATLAB-compatible format (default) + - ``dj0``: Extended format for Python-specific types + + Parameters + ---------- + squeeze : bool, optional + If True, remove singleton dimensions from arrays and convert + 0-dimensional arrays to scalars. Default False. + + Attributes + ---------- + protocol : bytes or None + Current serialization protocol (``b"mYm\\0"`` or ``b"dj0\\0"``). + """ + + def __init__(self, squeeze: bool = False) -> None: self._squeeze = squeeze self._blob = None self._pos = 0 self.protocol = None - def set_dj0(self): + def set_dj0(self) -> None: + """Switch to dj0 protocol for extended type support.""" self.protocol = b"dj0\0" # when using new blob features - def squeeze(self, array, convert_to_scalar=True): + def squeeze(self, array: np.ndarray, convert_to_scalar: bool = True) -> np.ndarray: """ - Simplify the input array - squeeze out all singleton dimensions. - If convert_to_scalar, then convert zero-dimensional arrays to scalars + Remove singleton dimensions from an array. + + Parameters + ---------- + array : np.ndarray + Input array. + convert_to_scalar : bool, optional + If True, convert 0-dimensional arrays to Python scalars. Default True. 
+ + Returns + ------- + np.ndarray or scalar + Squeezed array or scalar value. """ if not self._squeeze: return array @@ -233,9 +281,19 @@ def read_array(self): data = data + 1j * self.read_value(dtype, count=n_elem) return self.squeeze(data.reshape(shape, order="F")) - def pack_array(self, array): + def pack_array(self, array: np.ndarray) -> bytes: """ - Serialize an np.ndarray into bytes. Scalars are encoded with ndim=0. + Serialize a NumPy array into bytes. + + Parameters + ---------- + array : np.ndarray + Array to serialize. Scalars are encoded with ndim=0. + + Returns + ------- + bytes + Serialized array data. """ if "datetime64" in array.dtype.name: self.set_dj0() @@ -497,10 +555,60 @@ def pack(self, obj, compress): return blob -def pack(obj, compress=True): +def pack(obj, compress: bool = True) -> bytes: + """ + Serialize a Python object to binary blob format. + + Parameters + ---------- + obj : any + Object to serialize. Supports NumPy arrays, Python scalars, + collections (dict, list, tuple, set), datetime objects, UUID, + Decimal, and MATLAB-compatible MatCell/MatStruct. + compress : bool, optional + If True (default), compress blobs larger than 1000 bytes using zlib. + + Returns + ------- + bytes + Serialized binary data. + + Raises + ------ + DataJointError + If the object type is not supported. + + Examples + -------- + >>> data = np.array([1, 2, 3]) + >>> blob = pack(data) + >>> unpacked = unpack(blob) + """ return Blob().pack(obj, compress=compress) -def unpack(blob, squeeze=False): +def unpack(blob: bytes, squeeze: bool = False): + """ + Deserialize a binary blob to a Python object. + + Parameters + ---------- + blob : bytes + Binary data from ``pack()`` or MATLAB mYm serialization. + squeeze : bool, optional + If True, remove singleton dimensions from arrays. Default False. + + Returns + ------- + any + Deserialized Python object. + + Examples + -------- + >>> blob = pack({'a': 1, 'b': [1, 2, 3]}) + >>> data = unpack(blob) + >>> data['b'] + [1, 2, 3] + """ if blob is not None: return Blob(squeeze=squeeze).unpack(blob) diff --git a/src/datajoint/codecs.py b/src/datajoint/codecs.py index cc592badd..6eca19155 100644 --- a/src/datajoint/codecs.py +++ b/src/datajoint/codecs.py @@ -54,27 +54,30 @@ class Codec(ABC): Requires Python 3.10+. - Attributes: - name: Unique identifier used in ```` syntax. Must be set by subclasses. - - Example: - class GraphCodec(dj.Codec): - name = "graph" - - def get_dtype(self, is_external: bool) -> str: - return "" - - def encode(self, graph, *, key=None, store_name=None): - return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} - - def decode(self, stored, *, key=None): - import networkx as nx - G = nx.Graph() - G.add_nodes_from(stored['nodes']) - G.add_edges_from(stored['edges']) - return G - - The codec can then be used in table definitions:: + Attributes + ---------- + name : str or None + Unique identifier used in ```` syntax. Must be set by subclasses. + + Examples + -------- + >>> class GraphCodec(dj.Codec): + ... name = "graph" + ... + ... def get_dtype(self, is_external: bool) -> str: + ... return "" + ... + ... def encode(self, graph, *, key=None, store_name=None): + ... return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} + ... + ... def decode(self, stored, *, key=None): + ... import networkx as nx + ... G = nx.Graph() + ... G.add_nodes_from(stored['nodes']) + ... G.add_edges_from(stored['edges']) + ... 
return G + + Use in table definitions:: class Connectivity(dj.Manual): definition = ''' @@ -83,7 +86,7 @@ class Connectivity(dj.Manual): graph_data : ''' - To skip auto-registration (for abstract base classes):: + Skip auto-registration for abstract base classes:: class ExternalOnlyCodec(dj.Codec, register=False): '''Abstract base - not registered.''' @@ -120,15 +123,23 @@ def get_dtype(self, is_external: bool) -> str: """ Return the storage dtype for this codec. - Args: - is_external: True if @ modifier present (external storage) - - Returns: - A core type (e.g., "bytes", "json") or another codec (e.g., "") - - Raises: - NotImplementedError: If not overridden by subclass. - DataJointError: If external storage not supported but requested. + Parameters + ---------- + is_external : bool + True if ``@`` modifier present (external storage). + + Returns + ------- + str + A core type (e.g., ``"bytes"``, ``"json"``) or another codec + (e.g., ``""``). + + Raises + ------ + NotImplementedError + If not overridden by subclass. + DataJointError + If external storage not supported but requested. """ raise NotImplementedError(f"Codec <{self.name}> must implement get_dtype()") @@ -137,12 +148,18 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None """ Encode Python value for storage. - Args: - value: The Python object to store. - key: Primary key values as a dict. May be needed for path construction. - store_name: Target store name for external storage. - - Returns: + Parameters + ---------- + value : any + The Python object to store. + key : dict, optional + Primary key values. May be needed for path construction. + store_name : str, optional + Target store name for external storage. + + Returns + ------- + any Value in the format expected by the dtype. """ ... @@ -152,11 +169,16 @@ def decode(self, stored: Any, *, key: dict | None = None) -> Any: """ Decode stored value back to Python. - Args: - stored: Data retrieved from storage. - key: Primary key values as a dict. + Parameters + ---------- + stored : any + Data retrieved from storage. + key : dict, optional + Primary key values. - Returns: + Returns + ------- + any The reconstructed Python object. """ ... @@ -169,12 +191,17 @@ def validate(self, value: Any) -> None: Called automatically before ``encode()`` during INSERT operations. The default implementation accepts any value. - Args: - value: The value to validate. - - Raises: - TypeError: If the value has an incompatible type. - ValueError: If the value fails domain validation. + Parameters + ---------- + value : any + The value to validate. + + Raises + ------ + TypeError + If the value has an incompatible type. + ValueError + If the value fails domain validation. """ pass @@ -186,19 +213,25 @@ def parse_type_spec(spec: str) -> tuple[str, str | None]: """ Parse a type specification into type name and optional store parameter. - Handles formats like: - - "" -> ("blob", None) - - "" -> ("blob", "cold") - - "" -> ("blob", "") # default store - - "blob@cold" -> ("blob", "cold") - - "blob" -> ("blob", None) - - Args: - spec: Type specification string, with or without angle brackets. - - Returns: - Tuple of (type_name, store_name). store_name is None if not specified, - empty string if @ present without name (default store). + Parameters + ---------- + spec : str + Type specification string, with or without angle brackets. + + Returns + ------- + tuple[str, str | None] + ``(type_name, store_name)``. 
``store_name`` is None if not specified, + empty string if ``@`` present without name (default store). + + Examples + -------- + >>> parse_type_spec("") + ("blob", None) + >>> parse_type_spec("") + ("blob", "cold") + >>> parse_type_spec("") + ("blob", "") """ # Strip angle brackets spec = spec.strip("<>").strip() @@ -216,11 +249,15 @@ def unregister_codec(name: str) -> None: Primarily useful for testing. Use with caution in production code. - Args: - name: The codec name to unregister. + Parameters + ---------- + name : str + The codec name to unregister. - Raises: - DataJointError: If the codec is not registered. + Raises + ------ + DataJointError + If the codec is not registered. """ name = name.strip("<>") if name not in _codec_registry: @@ -235,15 +272,21 @@ def get_codec(name: str) -> Codec: Looks up the codec in the explicit registry first, then attempts to load from installed packages via entry points. - Args: - name: The codec name, with or without angle brackets. - Store parameters (e.g., "") are stripped. + Parameters + ---------- + name : str + The codec name, with or without angle brackets. + Store parameters (e.g., ``""``) are stripped. - Returns: + Returns + ------- + Codec The registered Codec instance. - Raises: - DataJointError: If the codec is not found. + Raises + ------ + DataJointError + If the codec is not found. """ # Strip angle brackets and store parameter type_name, _ = parse_type_spec(name) @@ -267,7 +310,9 @@ def list_codecs() -> list[str]: """ List all registered codec names. - Returns: + Returns + ------- + list[str] Sorted list of registered codec names. """ _load_entry_points() @@ -278,10 +323,14 @@ def is_codec_registered(name: str) -> bool: """ Check if a codec name is registered. - Args: - name: The codec name to check (store parameters are ignored). + Parameters + ---------- + name : str + The codec name to check (store parameters are ignored). - Returns: + Returns + ------- + bool True if the codec is registered. """ type_name, _ = parse_type_spec(name) @@ -346,31 +395,38 @@ def resolve_dtype( """ Resolve a dtype string, following codec chains. - If dtype references another codec (e.g., ""), recursively + If dtype references another codec (e.g., ``""``), recursively resolves to find the ultimate storage type. Store parameters are propagated through the chain. - Args: - dtype: The dtype string to resolve (e.g., "", "", "bytes"). - seen: Set of already-seen codec names (for cycle detection). - store_name: Store name from outer type specification (propagated inward). - - Returns: - Tuple of (final_storage_type, list_of_codecs_in_chain, resolved_store_name). - The chain is ordered from outermost to innermost codec. - - Raises: - DataJointError: If a circular type reference is detected. - - Examples: - >>> resolve_dtype("") - ("bytes", [BlobCodec], None) - - >>> resolve_dtype("") - ("", [BlobCodec], "cold") # BlobCodec.get_dtype(True) returns "" - - >>> resolve_dtype("bytes") - ("bytes", [], None) + Parameters + ---------- + dtype : str + The dtype string to resolve (e.g., ``""``, ``""``, ``"bytes"``). + seen : set[str], optional + Set of already-seen codec names (for cycle detection). + store_name : str, optional + Store name from outer type specification (propagated inward). + + Returns + ------- + tuple[str, list[Codec], str | None] + ``(final_storage_type, codec_chain, resolved_store_name)``. + Chain is ordered from outermost to innermost codec. + + Raises + ------ + DataJointError + If a circular type reference is detected. 
+ + Examples + -------- + >>> resolve_dtype("") + ("bytes", [BlobCodec], None) + >>> resolve_dtype("") + ("", [BlobCodec], "cold") + >>> resolve_dtype("bytes") + ("bytes", [], None) """ if seen is None: seen = set() @@ -420,18 +476,24 @@ def lookup_codec(codec_spec: str) -> tuple[Codec, str | None]: """ Look up a codec from a type specification string. - Parses a codec specification (e.g., "") and returns + Parses a codec specification (e.g., ``""``) and returns the codec instance along with any store name. - Args: - codec_spec: The codec specification, with or without angle brackets. - May include store parameter (e.g., ""). - - Returns: - Tuple of (Codec instance, store_name or None). - - Raises: - DataJointError: If the codec is not found. + Parameters + ---------- + codec_spec : str + The codec specification, with or without angle brackets. + May include store parameter (e.g., ``""``). + + Returns + ------- + tuple[Codec, str | None] + ``(codec_instance, store_name)`` or ``(codec_instance, None)``. + + Raises + ------ + DataJointError + If the codec is not found. """ type_name, store_name = parse_type_spec(codec_spec) diff --git a/src/datajoint/condition.py b/src/datajoint/condition.py index 750208e5c..8ab19ca5d 100644 --- a/src/datajoint/condition.py +++ b/src/datajoint/condition.py @@ -393,7 +393,7 @@ def combine_conditions(negate, conditions): def extract_column_names(sql_expression: str) -> set[str]: - """ + r""" Extract column names from an SQL expression. Parameters diff --git a/src/datajoint/connection.py b/src/datajoint/connection.py index be5e6183b..394952886 100644 --- a/src/datajoint/connection.py +++ b/src/datajoint/connection.py @@ -11,7 +11,7 @@ import warnings from contextlib import contextmanager from getpass import getpass -from typing import Any, Callable +from typing import Callable import pymysql as client diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index 77638d4f7..d8479b124 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -1,8 +1,12 @@ """ -This module hosts functions to convert DataJoint table definitions into mysql table definitions, and to -declare the corresponding mysql tables. +Table definition parsing and SQL generation. + +This module converts DataJoint table definitions into MySQL CREATE TABLE +statements, handling type mapping, foreign key resolution, and index creation. """ +from __future__ import annotations + import logging import re @@ -92,8 +96,25 @@ assert SPECIAL_TYPES <= set(TYPE_PATTERN) -def match_type(attribute_type): - """Match an attribute type string to a category.""" +def match_type(attribute_type: str) -> str: + """ + Match an attribute type string to its category. + + Parameters + ---------- + attribute_type : str + The type string from the table definition (e.g., ``"float32"``, ``"varchar(255)"``). + + Returns + ------- + str + Category name from TYPE_PATTERN (e.g., ``"FLOAT32"``, ``"STRING"``, ``"CODEC"``). + + Raises + ------ + DataJointError + If the type string doesn't match any known pattern. + """ try: return next(category for category, pattern in TYPE_PATTERN.items() if pattern.match(attribute_type)) except StopIteration: @@ -103,7 +124,16 @@ def match_type(attribute_type): logger = logging.getLogger(__name__.split(".")[0]) -def build_foreign_key_parser(): +def build_foreign_key_parser() -> pp.ParserElement: + """ + Build a pyparsing parser for foreign key definitions. 
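+
+    For example, a definition line such as ``"-> [nullable] Session"`` parses
+    into ``options=["nullable"]`` and ``ref_table="Session"``.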
+ + Returns + ------- + pp.ParserElement + Parser that extracts ``options`` and ``ref_table`` from lines like + ``-> [nullable] ParentTable``. + """ arrow = pp.Literal("->").suppress() lbracket = pp.Literal("[").suppress() rbracket = pp.Literal("]").suppress() @@ -113,7 +143,16 @@ def build_foreign_key_parser(): return arrow + options + ref_table -def build_attribute_parser(): +def build_attribute_parser() -> pp.ParserElement: + """ + Build a pyparsing parser for attribute definitions. + + Returns + ------- + pp.ParserElement + Parser that extracts ``name``, ``type``, ``default``, and ``comment`` + from attribute definition lines. + """ quoted = pp.QuotedString('"') ^ pp.QuotedString("'") colon = pp.Literal(":").suppress() attribute_name = pp.Word(pp.srange("[a-z]"), pp.srange("[a-z0-9_]")).set_results_name("name") @@ -130,27 +169,62 @@ def build_attribute_parser(): attribute_parser = build_attribute_parser() -def is_foreign_key(line): +def is_foreign_key(line: str) -> bool: """ - - :param line: a line from the table definition - :return: true if the line appears to be a foreign key definition + Check if a definition line is a foreign key reference. + + Parameters + ---------- + line : str + A line from the table definition. + + Returns + ------- + bool + True if the line appears to be a foreign key definition (contains ``->`` + not inside quotes or comments). """ arrow_position = line.find("->") return arrow_position >= 0 and not any(c in line[:arrow_position] for c in "\"#'") -def compile_foreign_key(line, context, attributes, primary_key, attr_sql, foreign_key_sql, index_sql, fk_attribute_map=None): +def compile_foreign_key( + line: str, + context: dict, + attributes: list[str], + primary_key: list[str] | None, + attr_sql: list[str], + foreign_key_sql: list[str], + index_sql: list[str], + fk_attribute_map: dict[str, tuple[str, str]] | None = None, +) -> None: """ - :param line: a line from a table definition - :param context: namespace containing referenced objects - :param attributes: list of attribute names already in the declaration -- to be updated by this function - :param primary_key: None if the current foreign key is made from the dependent section. Otherwise it is the list - of primary key attributes thus far -- to be updated by the function - :param attr_sql: list of sql statements defining attributes -- to be updated by this function. - :param foreign_key_sql: list of sql statements specifying foreign key constraints -- to be updated by this function. - :param index_sql: list of INDEX declaration statements, duplicate or redundant indexes are ok. - :param fk_attribute_map: dict mapping child attr -> (parent_table, parent_attr) -- to be updated by this function. + Parse a foreign key line and update declaration components. + + Parameters + ---------- + line : str + A foreign key line from the table definition (e.g., ``"-> Parent"``). + context : dict + Namespace containing referenced table objects. + attributes : list[str] + Attribute names already declared. Updated in place with new FK attributes. + primary_key : list[str] or None + Primary key attributes so far. None if in dependent section. + Updated in place with FK attributes when not None. + attr_sql : list[str] + SQL attribute definitions. Updated in place. + foreign_key_sql : list[str] + SQL FOREIGN KEY constraints. Updated in place. + index_sql : list[str] + SQL INDEX declarations. Updated in place. + fk_attribute_map : dict, optional + Mapping of ``child_attr -> (parent_table, parent_attr)``. Updated in place. 
+ + Raises + ------ + DataJointError + If the foreign key reference cannot be resolved or options are invalid. """ # Parse and validate from .expression import QueryExpression @@ -214,7 +288,32 @@ def compile_foreign_key(line, context, attributes, primary_key, attr_sql, foreig index_sql.append("UNIQUE INDEX ({attrs})".format(attrs=",".join("`%s`" % attr for attr in ref.primary_key))) -def prepare_declare(definition, context): +def prepare_declare( + definition: str, context: dict +) -> tuple[str, list[str], list[str], list[str], list[str], list[str], dict[str, tuple[str, str]]]: + """ + Parse a table definition into its components. + + Parameters + ---------- + definition : str + DataJoint table definition string. + context : dict + Namespace for resolving foreign key references. + + Returns + ------- + tuple + Seven-element tuple containing: + + - table_comment : str + - primary_key : list[str] + - attribute_sql : list[str] + - foreign_key_sql : list[str] + - index_sql : list[str] + - external_stores : list[str] + - fk_attribute_map : dict[str, tuple[str, str]] + """ # split definition into lines definition = re.split(r"\s*\n\s*", definition.strip()) # check for optional table comment @@ -269,14 +368,35 @@ def prepare_declare(definition, context): ) -def declare(full_table_name, definition, context): - """ - Parse declaration and generate the SQL CREATE TABLE code - - :param full_table_name: full name of the table - :param definition: DataJoint table definition - :param context: dictionary of objects that might be referred to in the table - :return: SQL CREATE TABLE statement, list of external stores used +def declare( + full_table_name: str, definition: str, context: dict +) -> tuple[str, list[str], list[str], dict[str, tuple[str, str]]]: + r""" + Parse a definition and generate SQL CREATE TABLE statement. + + Parameters + ---------- + full_table_name : str + Fully qualified table name (e.g., ```\`schema\`.\`table\```). + definition : str + DataJoint table definition string. + context : dict + Namespace for resolving foreign key references. + + Returns + ------- + tuple + Four-element tuple: + + - sql : str - SQL CREATE TABLE statement + - external_stores : list[str] - External store names used + - primary_key : list[str] - Primary key attribute names + - fk_attribute_map : dict - FK attribute lineage mapping + + Raises + ------ + DataJointError + If table name exceeds max length or has no primary key. """ table_name = full_table_name.strip("`").split(".")[1] if len(table_name) > MAX_TABLE_NAME_LENGTH: @@ -322,12 +442,28 @@ def declare(full_table_name, definition, context): return sql, external_stores, primary_key, fk_attribute_map -def _make_attribute_alter(new, old, primary_key): +def _make_attribute_alter(new: list[str], old: list[str], primary_key: list[str]) -> list[str]: """ - :param new: new attribute declarations - :param old: old attribute declarations - :param primary_key: primary key attributes - :return: list of SQL ALTER commands + Generate SQL ALTER commands for attribute changes. + + Parameters + ---------- + new : list[str] + New attribute SQL declarations. + old : list[str] + Old attribute SQL declarations. + primary_key : list[str] + Primary key attribute names (cannot be altered). + + Returns + ------- + list[str] + SQL ALTER commands (ADD, MODIFY, CHANGE, DROP). + + Raises + ------ + DataJointError + If an attribute is renamed twice or renamed from non-existent attribute. 
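+
+    Examples
+    --------
+    A hypothetical sketch (column SQL abbreviated); changing a column's type
+    produces a MODIFY command:
+
+    >>> _make_attribute_alter(
+    ...     new=["`amount` float NOT NULL COMMENT 'cost'"],
+    ...     old=["`amount` int NOT NULL COMMENT 'cost'"],
+    ...     primary_key=["id"],
+    ... )   # returns a MODIFY command for `amount`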
""" # parse attribute names name_regexp = re.compile(r"^`(?P\w+)`") @@ -391,12 +527,31 @@ def _make_attribute_alter(new, old, primary_key): return sql -def alter(definition, old_definition, context): +def alter(definition: str, old_definition: str, context: dict) -> tuple[list[str], list[str]]: """ - :param definition: new table definition - :param old_definition: current table definition - :param context: the context in which to evaluate foreign key definitions - :return: string SQL ALTER command, list of new stores used for external storage + Generate SQL ALTER commands for table definition changes. + + Parameters + ---------- + definition : str + New table definition. + old_definition : str + Current table definition. + context : dict + Namespace for resolving foreign key references. + + Returns + ------- + tuple + Two-element tuple: + + - sql : list[str] - SQL ALTER commands + - new_stores : list[str] - New external stores used + + Raises + ------ + NotImplementedError + If attempting to alter primary key, foreign keys, or indexes. """ ( table_comment, @@ -432,7 +587,24 @@ def alter(definition, old_definition, context): return sql, [e for e in external_stores if e not in external_stores_] -def compile_index(line, index_sql): +def compile_index(line: str, index_sql: list[str]) -> None: + """ + Parse an index declaration and append SQL to index_sql. + + Parameters + ---------- + line : str + Index declaration line (e.g., ``"index(attr1, attr2)"`` or + ``"unique index(attr)"``). + index_sql : list[str] + List of index SQL declarations. Updated in place. + + Raises + ------ + DataJointError + If the index syntax is invalid. + """ + def format_attribute(attr): match, attr = translate_attribute(attr) if match is None: @@ -455,18 +627,25 @@ def format_attribute(attr): ) -def substitute_special_type(match, category, foreign_key_sql, context): +def substitute_special_type(match: dict, category: str, foreign_key_sql: list[str], context: dict) -> None: """ Substitute special types with their native SQL equivalents. - Special types are: - - Core DataJoint types (float32 β†’ float, uuid β†’ binary(16), bytes β†’ longblob, etc.) - - CODEC types (Codecs in angle brackets) - - :param match: dict containing with keys "type" and "comment" -- will be modified in place - :param category: attribute type category from TYPE_PATTERN - :param foreign_key_sql: list of foreign key declarations to add to - :param context: context for looking up user-defined codecs (unused, kept for compatibility) + Special types include core DataJoint types (``float32`` β†’ ``float``, + ``uuid`` β†’ ``binary(16)``, ``bytes`` β†’ ``longblob``) and codec types + (angle bracket syntax like ````). + + Parameters + ---------- + match : dict + Parsed attribute with keys ``"type"``, ``"comment"``, etc. + Modified in place with substituted type. + category : str + Type category from TYPE_PATTERN (e.g., ``"FLOAT32"``, ``"CODEC"``). + foreign_key_sql : list[str] + Foreign key declarations (unused, kept for API compatibility). + context : dict + Namespace for codec lookup (unused, kept for API compatibility). 
""" if category == "CODEC": # Codec - resolve to underlying dtype @@ -499,15 +678,34 @@ def substitute_special_type(match, category, foreign_key_sql, context): assert False, f"Unknown special type: {category}" -def compile_attribute(line, in_key, foreign_key_sql, context): +def compile_attribute(line: str, in_key: bool, foreign_key_sql: list[str], context: dict) -> tuple[str, str, str | None]: """ - Convert attribute definition from DataJoint format to SQL - - :param line: attribution line - :param in_key: set to True if attribute is in primary key set - :param foreign_key_sql: the list of foreign key declarations to add to - :param context: context in which to look up user-defined attribute type adapterss - :returns: (name, sql, store) -- attribute name, sql code for its declaration, and optional store name + Convert an attribute definition from DataJoint format to SQL. + + Parameters + ---------- + line : str + Attribute definition line (e.g., ``"session_id : int32 # unique session"``). + in_key : bool + True if the attribute is part of the primary key. + foreign_key_sql : list[str] + Foreign key declarations (passed to type substitution). + context : dict + Namespace for codec lookup. + + Returns + ------- + tuple + Three-element tuple: + + - name : str - Attribute name + - sql : str - SQL column declaration + - store : str or None - External store name if applicable + + Raises + ------ + DataJointError + If syntax is invalid, primary key is nullable, or blob has invalid default. """ try: match = attribute_parser.parse_string(line + "#", parse_all=True) diff --git a/src/datajoint/dependencies.py b/src/datajoint/dependencies.py index a342bf3f0..621011426 100644 --- a/src/datajoint/dependencies.py +++ b/src/datajoint/dependencies.py @@ -1,3 +1,13 @@ +""" +Foreign key dependency graph for DataJoint schemas. + +This module provides the Dependencies class that tracks foreign key +relationships between tables and supports topological sorting for +proper ordering of operations like delete and drop. +""" + +from __future__ import annotations + import itertools import re from collections import defaultdict @@ -7,18 +17,37 @@ from .errors import DataJointError -def extract_master(part_table): - """ - given a part table name, return master part. None if not a part table +def extract_master(part_table: str) -> str | None: + r""" + Extract master table name from a part table name. + + Parameters + ---------- + part_table : str + Full table name (e.g., ```\`schema\`.\`master__part\```). + + Returns + ------- + str or None + Master table name if part_table is a part table, None otherwise. """ match = re.match(r"(?P`\w+`.`#?\w+)__\w+`", part_table) return match["master"] + "`" if match else None -def topo_sort(graph): +def topo_sort(graph: nx.DiGraph) -> list[str]: """ - topological sort of a dependency graph that keeps part tables together with their masters - :return: list of table names in topological order + Topological sort keeping part tables with their masters. + + Parameters + ---------- + graph : nx.DiGraph + Dependency graph. + + Returns + ------- + list[str] + Table names in topological order with parts following masters. """ graph = nx.DiGraph(graph) # make a copy @@ -69,28 +98,52 @@ def topo_sort(graph): class Dependencies(nx.DiGraph): """ - The graph of dependencies (foreign keys) between loaded tables. + Graph of foreign key dependencies between loaded tables. 
+ + Extends NetworkX DiGraph to track foreign key relationships and + support operations like cascade delete and topological ordering. + + Parameters + ---------- + connection : Connection, optional + Database connection. May be None to support NetworkX algorithms + that create objects with empty constructors. + + Attributes + ---------- + _conn : Connection or None + Database connection. + _loaded : bool + Whether dependencies have been loaded from the database. - Note: the 'connection' argument should normally be supplied; - Empty use is permitted to facilitate use of networkx algorithms which - internally create objects with the expectation of empty constructors. - See also: https://github.com/datajoint/datajoint-python/pull/443 + Notes + ----- + Empty constructor use is permitted to facilitate NetworkX algorithms. + See: https://github.com/datajoint/datajoint-python/pull/443 """ - def __init__(self, connection=None): + def __init__(self, connection=None) -> None: self._conn = connection self._node_alias_count = itertools.count() self._loaded = False super().__init__(self) - def clear(self): + def clear(self) -> None: + """Clear the graph and reset loaded state.""" self._loaded = False super().clear() - def load(self, force=True): + def load(self, force: bool = True) -> None: """ Load dependencies for all loaded schemas. - This method gets called before any operation that requires dependencies: delete, drop, populate, progress. + + Called before operations requiring dependencies: delete, drop, + populate, progress. + + Parameters + ---------- + force : bool, optional + If True (default), reload even if already loaded. """ # reload from scratch to prevent duplication of renamed edges if self._loaded and not force: @@ -165,45 +218,90 @@ def load(self, force=True): raise DataJointError("DataJoint can only work with acyclic dependencies") self._loaded = True - def topo_sort(self): - """:return: list of tables names in topological order""" - return topo_sort(self) + def topo_sort(self) -> list[str]: + """ + Return table names in topological order. - def parents(self, table_name, primary=None): + Returns + ------- + list[str] + Table names sorted topologically. """ - :param table_name: `schema`.`table` - :param primary: if None, then all parents are returned. If True, then only foreign keys composed of - primary key attributes are considered. If False, the only foreign keys including at least one non-primary - attribute are considered. - :return: dict of tables referenced by the foreign keys of table + return topo_sort(self) + + def parents(self, table_name: str, primary: bool | None = None) -> dict: + r""" + Get tables referenced by this table's foreign keys. + + Parameters + ---------- + table_name : str + Full table name (```\`schema\`.\`table\```). + primary : bool, optional + If None, return all parents. If True, only FK composed entirely + of primary key attributes. If False, only FK with at least one + non-primary attribute. + + Returns + ------- + dict + Mapping of parent table name to edge properties. """ self.load(force=False) return {p[0]: p[2] for p in self.in_edges(table_name, data=True) if primary is None or p[2]["primary"] == primary} - def children(self, table_name, primary=None): - """ - :param table_name: `schema`.`table` - :param primary: if None, then all children are returned. If True, then only foreign keys composed of - primary key attributes are considered. If False, the only foreign keys including at least one non-primary - attribute are considered. 
- :return: dict of tables referencing the table through foreign keys + def children(self, table_name: str, primary: bool | None = None) -> dict: + r""" + Get tables that reference this table through foreign keys. + + Parameters + ---------- + table_name : str + Full table name (```\`schema\`.\`table\```). + primary : bool, optional + If None, return all children. If True, only FK composed entirely + of primary key attributes. If False, only FK with at least one + non-primary attribute. + + Returns + ------- + dict + Mapping of child table name to edge properties. """ self.load(force=False) return {p[1]: p[2] for p in self.out_edges(table_name, data=True) if primary is None or p[2]["primary"] == primary} - def descendants(self, full_table_name): - """ - :param full_table_name: In form `schema`.`table_name` - :return: all dependent tables sorted in topological order. Self is included. + def descendants(self, full_table_name: str) -> list[str]: + r""" + Get all dependent tables in topological order. + + Parameters + ---------- + full_table_name : str + Full table name (```\`schema\`.\`table_name\```). + + Returns + ------- + list[str] + Dependent tables in topological order. Self is included first. """ self.load(force=False) nodes = self.subgraph(nx.descendants(self, full_table_name)) return [full_table_name] + nodes.topo_sort() - def ancestors(self, full_table_name): - """ - :param full_table_name: In form `schema`.`table_name` - :return: all dependent tables sorted in topological order. Self is included. + def ancestors(self, full_table_name: str) -> list[str]: + r""" + Get all ancestor tables in reverse topological order. + + Parameters + ---------- + full_table_name : str + Full table name (```\`schema\`.\`table_name\```). + + Returns + ------- + list[str] + Ancestor tables in reverse topological order. Self is included last. """ self.load(force=False) nodes = self.subgraph(nx.ancestors(self, full_table_name)) diff --git a/src/datajoint/diagram.py b/src/datajoint/diagram.py index 3b6061102..de211df8f 100644 --- a/src/datajoint/diagram.py +++ b/src/datajoint/diagram.py @@ -1,3 +1,12 @@ +""" +Diagram visualization for DataJoint schemas. + +This module provides the Diagram class for visualizing schema structure +as directed acyclic graphs showing tables and their foreign key relationships. +""" + +from __future__ import annotations + import functools import inspect import io @@ -32,44 +41,58 @@ class Diagram: """ - Entity relationship diagram, currently disabled due to the lack of required packages: matplotlib and pygraphviz. + Schema diagram (disabled). + + Diagram visualization requires matplotlib and pygraphviz packages. + Install them to enable this feature. - To enable Diagram feature, please install both matplotlib and pygraphviz. For instructions on how to install - these two packages, refer to https://docs.datajoint.com/core/datajoint-python/0.14/client/install/ + See Also + -------- + https://docs.datajoint.com/core/datajoint-python/0.14/client/install/ """ - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: logger.warning("Please install matplotlib and pygraphviz libraries to enable the Diagram feature.") else: class Diagram(nx.DiGraph): """ - Schema diagram showing tables and foreign keys between in the form of a directed - acyclic graph (DAG). The diagram is derived from the connection.dependencies object. - - Usage: - - >>> diag = Diagram(source) - - source can be a table object, a table class, a schema, or a module that has a schema. 
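# Sketch of dependency-graph navigation using the methods above; assumes an
# established connection, and the schema/table names are hypothetical.
import datajoint as dj

deps = dj.conn().dependencies
pk_parents = deps.parents("`lab`.`session`", primary=True)  # FKs made only of PK attributes
children = deps.children("`lab`.`subject`")
delete_order = deps.descendants("`lab`.`subject`")          # self first, then dependents
ancestry = deps.ancestors("`lab`.`subject`")                # reverse topological, self last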
- + Schema diagram as a directed acyclic graph (DAG). + + Visualizes tables and foreign key relationships derived from + ``connection.dependencies``. + + Parameters + ---------- + source : Table, Schema, or module + A table object, table class, schema, or module with a schema. + context : dict, optional + Namespace for resolving table class names. If None, uses caller's + frame globals/locals. + + Examples + -------- + >>> diag = dj.Diagram(schema.MyTable) >>> diag.draw() - draws the diagram using pyplot + Operators: + + - ``diag1 + diag2`` - union of diagrams + - ``diag1 - diag2`` - difference of diagrams + - ``diag1 * diag2`` - intersection of diagrams + - ``diag + n`` - expand n levels of successors (children) + - ``diag - n`` - expand n levels of predecessors (parents) - diag1 + diag2 - combines the two diagrams. - diag1 - diag2 - difference between diagrams - diag1 * diag2 - intersection of diagrams - diag + n - expands n levels of successors - diag - n - expands n levels of predecessors - Thus dj.Diagram(schema.Table)+1-1 defines the diagram of immediate ancestors and descendants of schema.Table + >>> dj.Diagram(schema.Table) + 1 - 1 # immediate ancestors and descendants - Note that diagram + 1 - 1 may differ from diagram - 1 + 1 and so forth. - Only those tables that are loaded in the connection object are displayed + Notes + ----- + ``diagram + 1 - 1`` may differ from ``diagram - 1 + 1``. + Only tables loaded in the connection are displayed. """ - def __init__(self, source, context=None): + def __init__(self, source, context=None) -> None: if isinstance(source, Diagram): # copy constructor self.nodes_to_show = set(source.nodes_to_show) @@ -115,27 +138,33 @@ def __init__(self, source, context=None): self.nodes_to_show.add(node) @classmethod - def from_sequence(cls, sequence): + def from_sequence(cls, sequence) -> "Diagram": """ - The join Diagram for all objects in sequence + Create combined Diagram from a sequence of sources. + + Parameters + ---------- + sequence : iterable + Sequence of table objects, classes, or schemas. - :param sequence: a sequence (e.g. list, tuple) - :return: Diagram(arg1) + ... + Diagram(argn) + Returns + ------- + Diagram + Union of diagrams: ``Diagram(arg1) + ... + Diagram(argn)``. """ return functools.reduce(lambda x, y: x + y, map(Diagram, sequence)) - def add_parts(self): + def add_parts(self) -> "Diagram": """ - Adds to the diagram the part tables of all master tables already in the diagram - :return: + Add part tables of all masters already in the diagram. + + Returns + ------- + Diagram + New diagram with part tables included. """ def is_part(part, master): - """ - :param part: `database`.`table_name` - :param master: `database`.`table_name` - :return: True if part is part of master. - """ part = [s.strip("`") for s in part.split(".")] master = [s.strip("`") for s in master.split(".")] return master[0] == part[0] and master[1] + "__" == part[1][: len(master[1]) + 2] @@ -144,11 +173,19 @@ def is_part(part, master): self.nodes_to_show.update(n for n in self.nodes() if any(is_part(n, m) for m in self.nodes_to_show)) return self - def __add__(self, arg): + def __add__(self, arg) -> "Diagram": """ - :param arg: either another Diagram or a positive integer. - :return: Union of the diagrams when arg is another Diagram - or an expansion downstream when arg is a positive integer. + Union or downstream expansion. + + Parameters + ---------- + arg : Diagram or int + Another Diagram for union, or positive int for downstream expansion. 
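# Hedged sketch of the diagram algebra documented above; `schema` is a
# hypothetical dj.Schema with Subject and Session tables. Expansion is not
# commutative: diagram + 1 - 1 may differ from diagram - 1 + 1.
import datajoint as dj

diag = dj.Diagram(schema.Session) + 1 - 1          # immediate descendants and ancestors
combined = dj.Diagram.from_sequence([schema.Subject, schema.Session])
with_parts = dj.Diagram(schema).add_parts()        # include part tables of shown masters
diag.draw()                                        # requires matplotlib and pygraphviz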
+ + Returns + ------- + Diagram + Combined or expanded diagram. """ self = Diagram(self) # copy try: @@ -166,11 +203,19 @@ def __add__(self, arg): self.nodes_to_show.update(new) return self - def __sub__(self, arg): + def __sub__(self, arg) -> "Diagram": """ - :param arg: either another Diagram or a positive integer. - :return: Difference of the diagrams when arg is another Diagram or - an expansion upstream when arg is a positive integer. + Difference or upstream expansion. + + Parameters + ---------- + arg : Diagram or int + Another Diagram for difference, or positive int for upstream expansion. + + Returns + ------- + Diagram + Reduced or expanded diagram. """ self = Diagram(self) # copy try: @@ -189,23 +234,43 @@ def __sub__(self, arg): self.nodes_to_show.update(new) return self - def __mul__(self, arg): + def __mul__(self, arg) -> "Diagram": """ - Intersection of two diagrams - :param arg: another Diagram - :return: a new Diagram comprising nodes that are present in both operands. + Intersection of two diagrams. + + Parameters + ---------- + arg : Diagram + Another Diagram. + + Returns + ------- + Diagram + Diagram with nodes present in both operands. """ self = Diagram(self) # copy self.nodes_to_show.intersection_update(arg.nodes_to_show) return self - def topo_sort(self): - """return nodes in lexicographical topological order""" + def topo_sort(self) -> list[str]: + """ + Return nodes in topological order. + + Returns + ------- + list[str] + Node names in topological order. + """ return topo_sort(self) - def _make_graph(self): + def _make_graph(self) -> nx.DiGraph: """ - Make the self.graph - a graph object ready for drawing + Build graph object ready for drawing. + + Returns + ------- + nx.DiGraph + Graph with nodes relabeled to class names. """ # mark "distinguished" tables, i.e. those that introduce new primary key # attributes @@ -233,13 +298,14 @@ def _make_graph(self): return graph @staticmethod - def _encapsulate_edge_attributes(graph): + def _encapsulate_edge_attributes(graph: nx.DiGraph) -> None: """ - Modifies the `nx.Graph`'s edge attribute `attr_map` to be a string representation - of the attribute map, and encapsulates the string in double quotes. - Changes the graph in place. + Encapsulate edge attr_map in double quotes for pydot compatibility. + + Modifies graph in place. - Implements workaround described in + See Also + -------- https://github.com/pydot/pydot/issues/258#issuecomment-795798099 """ for u, v, *_, edgedata in graph.edges(data=True): @@ -247,13 +313,14 @@ def _encapsulate_edge_attributes(graph): graph.edges[u, v]["attr_map"] = '"{0}"'.format(edgedata["attr_map"]) @staticmethod - def _encapsulate_node_names(graph): + def _encapsulate_node_names(graph: nx.DiGraph) -> None: """ - Modifies the `nx.Graph`'s node names string representations encapsulated in - double quotes. - Changes the graph in place. + Encapsulate node names in double quotes for pydot compatibility. - Implements workaround described in + Modifies graph in place. + + See Also + -------- https://github.com/datajoint/datajoint-python/pull/1176 """ nx.relabel_nodes( @@ -396,7 +463,22 @@ def draw(self): else: raise DataJointError("pyplot was not imported") - def save(self, filename, format=None): + def save(self, filename: str, format: str | None = None) -> None: + """ + Save diagram to file. + + Parameters + ---------- + filename : str + Output filename. + format : str, optional + File format (``'png'`` or ``'svg'``). Inferred from extension if None. 
+ + Raises + ------ + DataJointError + If format is unsupported. + """ if format is None: if filename.lower().endswith(".png"): format = "png" diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index d469a9e78..ceabf4a8f 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -362,7 +362,9 @@ def __matmul__(self, other): "Use .join(other, semantic_check=False) for joins without semantic checking." ) - def join(self, other: QueryExpression | type, semantic_check: bool = True, left: bool = False, allow_nullable_pk: bool = False) -> QueryExpression: + def join( + self, other: QueryExpression | type, semantic_check: bool = True, left: bool = False, allow_nullable_pk: bool = False + ) -> QueryExpression: """ Join this expression with another. @@ -636,7 +638,9 @@ def proj(self, *attributes: str, **named_attributes: str) -> QueryExpression: ) return result - def aggr(self, group: QueryExpression, *attributes: str, keep_all_rows: bool = False, **named_attributes: str) -> QueryExpression: + def aggr( + self, group: QueryExpression, *attributes: str, keep_all_rows: bool = False, **named_attributes: str + ) -> QueryExpression: """ Aggregate data grouped by this expression's primary key. @@ -707,7 +711,14 @@ def _apply_top(self, order_by=None, limit=None, offset=None): return self.restrict(Top(limit, order_by, offset)) return self - def to_dicts(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = ".") -> list[dict[str, Any]]: + def to_dicts( + self, + order_by: str | list[str] | None = None, + limit: int | None = None, + offset: int | None = None, + squeeze: bool = False, + download_path: str = ".", + ) -> list[dict[str, Any]]: """ Fetch all rows as a list of dictionaries. @@ -738,7 +749,14 @@ def to_dicts(self, order_by: str | list[str] | None = None, limit: int | None = for row in cursor ] - def to_pandas(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = ".") -> pandas.DataFrame: + def to_pandas( + self, + order_by: str | list[str] | None = None, + limit: int | None = None, + offset: int | None = None, + squeeze: bool = False, + download_path: str = ".", + ) -> pandas.DataFrame: """ Fetch all rows as a pandas DataFrame with primary key as index. @@ -770,7 +788,14 @@ def to_pandas(self, order_by: str | list[str] | None = None, limit: int | None = df = df.set_index(self.primary_key) return df - def to_polars(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = "."): + def to_polars( + self, + order_by: str | list[str] | None = None, + limit: int | None = None, + offset: int | None = None, + squeeze: bool = False, + download_path: str = ".", + ): """ Fetch all rows as a polars DataFrame. 
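# Hedged usage sketch of the fetch methods above; `Session` is a hypothetical
# table class. order_by / limit / offset behave as documented.
rows = (Session() & 'subject_id = 1').to_dicts(order_by="session_date DESC", limit=10)
df = Session().to_pandas()       # primary key becomes the DataFrame index
pl_df = Session().to_polars()    # same rows as a polars DataFrame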
@@ -804,7 +829,14 @@ def to_polars(self, order_by: str | list[str] | None = None, limit: int | None = dicts = self.to_dicts(order_by=order_by, limit=limit, offset=offset, squeeze=squeeze, download_path=download_path) return polars.DataFrame(dicts) - def to_arrow(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = "."): + def to_arrow( + self, + order_by: str | list[str] | None = None, + limit: int | None = None, + offset: int | None = None, + squeeze: bool = False, + download_path: str = ".", + ): """ Fetch all rows as a PyArrow Table. @@ -840,7 +872,16 @@ def to_arrow(self, order_by: str | list[str] | None = None, limit: int | None = return pyarrow.table({}) return pyarrow.Table.from_pylist(dicts) - def to_arrays(self, *attrs: str, include_key: bool = False, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None, squeeze: bool = False, download_path: str = ".") -> np.ndarray | tuple[np.ndarray, ...]: + def to_arrays( + self, + *attrs: str, + include_key: bool = False, + order_by: str | list[str] | None = None, + limit: int | None = None, + offset: int | None = None, + squeeze: bool = False, + download_path: str = ".", + ) -> np.ndarray | tuple[np.ndarray, ...]: """ Fetch data as numpy arrays. @@ -930,7 +971,9 @@ def to_arrays(self, *attrs: str, include_key: bool = False, order_by: str | list ret[name] = list(map(partial(get, heading[name]), ret[name])) return ret - def keys(self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None) -> list[dict[str, Any]]: + def keys( + self, order_by: str | list[str] | None = None, limit: int | None = None, offset: int | None = None + ) -> list[dict[str, Any]]: """ Fetch primary key values as a list of dictionaries. diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index 7d35c8263..96e01f985 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -26,7 +26,7 @@ from .lineage import get_table_lineages, lineage_table_exists if TYPE_CHECKING: - from .connection import Connection + pass class _MissingType(Codec, register=False): diff --git a/src/datajoint/schemas.py b/src/datajoint/schemas.py index 513225879..1f038321c 100644 --- a/src/datajoint/schemas.py +++ b/src/datajoint/schemas.py @@ -21,7 +21,6 @@ if TYPE_CHECKING: from .connection import Connection - from .table import Table from .heading import Heading from .jobs import Job from .settings import config diff --git a/src/datajoint/storage.py b/src/datajoint/storage.py index d59a46357..0d401dbdf 100644 --- a/src/datajoint/storage.py +++ b/src/datajoint/storage.py @@ -5,6 +5,8 @@ backends (local filesystem, S3, GCS, Azure, etc.) using the fsspec library. """ +from __future__ import annotations + import json import logging import secrets @@ -30,11 +32,15 @@ def is_remote_url(path: str) -> bool: """ Check if a path is a remote URL. - Args: - path: Path string to check + Parameters + ---------- + path : str + Path string to check. - Returns: - True if path is a remote URL + Returns + ------- + bool + True if path starts with a supported remote protocol. """ if not isinstance(path, str): return False @@ -45,11 +51,20 @@ def parse_remote_url(url: str) -> tuple[str, str]: """ Parse a remote URL into protocol and path. - Args: - url: Remote URL (e.g., 's3://bucket/path/file.dat') + Parameters + ---------- + url : str + Remote URL (e.g., ``'s3://bucket/path/file.dat'``). 
+ + Returns + ------- + tuple[str, str] + ``(protocol, path)`` where protocol is fsspec-compatible. - Returns: - Tuple of (protocol, path) where protocol is fsspec-compatible + Raises + ------ + DataJointError + If URL protocol is not supported. """ url_lower = url.lower() @@ -76,11 +91,15 @@ def generate_token(length: int = 8) -> str: """ Generate a random token for filename collision avoidance. - Args: - length: Token length (4-16 characters, default 8) + Parameters + ---------- + length : int, optional + Token length, clamped to 4-16 characters. Default 8. - Returns: - Random URL-safe string + Returns + ------- + str + Random URL-safe string. """ length = max(4, min(16, length)) return "".join(secrets.choice(TOKEN_ALPHABET) for _ in range(length)) @@ -90,11 +109,15 @@ def encode_pk_value(value: Any) -> str: """ Encode a primary key value for use in storage paths. - Args: - value: Primary key value (int, str, date, etc.) + Parameters + ---------- + value : any + Primary key value (int, str, date, datetime, etc.). - Returns: - Path-safe string representation + Returns + ------- + str + Path-safe string representation. """ if isinstance(value, (int, float)): return str(value) @@ -133,17 +156,27 @@ def build_object_path( """ Build the storage path for an object attribute. - Args: - schema: Schema name - table: Table name - field: Field/attribute name - primary_key: Dict of primary key attribute names to values - ext: File extension (e.g., ".dat") or None - partition_pattern: Optional partition pattern with {attr} placeholders - token_length: Length of random token suffix - - Returns: - Tuple of (relative_path, token) + Parameters + ---------- + schema : str + Schema name. + table : str + Table name. + field : str + Field/attribute name. + primary_key : dict[str, Any] + Dict of primary key attribute names to values. + ext : str or None + File extension (e.g., ``".dat"``). + partition_pattern : str, optional + Partition pattern with ``{attr}`` placeholders. + token_length : int, optional + Length of random token suffix. Default 8. + + Returns + ------- + tuple[str, str] + ``(relative_path, token)``. """ token = generate_token(token_length) @@ -196,22 +229,36 @@ class StorageBackend: Provides a consistent interface for file operations across different storage backends including local filesystem and cloud object storage (S3, GCS, Azure). + + Parameters + ---------- + spec : dict[str, Any] + Storage configuration dictionary. See ``__init__`` for details. + + Attributes + ---------- + spec : dict + Storage configuration dictionary. + protocol : str + Storage protocol (``'file'``, ``'s3'``, ``'gcs'``, ``'azure'``). """ - def __init__(self, spec: dict[str, Any]): + def __init__(self, spec: dict[str, Any]) -> None: """ Initialize storage backend from configuration spec. 
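# Illustrative call to build_object_path() per the parameters above; the exact
# layout of the returned relative path (and the random token suffix) is an
# assumption, not a guarantee.
from datajoint.storage import build_object_path

rel_path, token = build_object_path(
    schema="lab",
    table="recording",
    field="raw_trace",
    primary_key={"subject_id": 12, "session": 3},
    ext=".dat",
)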
- Args: - spec: Storage configuration dictionary containing: - - protocol: Storage protocol ('file', 's3', 'gcs', 'azure') - - location: Base path or bucket prefix - - bucket: Bucket name (for cloud storage) - - endpoint: Endpoint URL (for S3-compatible storage) - - access_key: Access key (for cloud storage) - - secret_key: Secret key (for cloud storage) - - secure: Use HTTPS (default: True for cloud) - - Additional protocol-specific options + Parameters + ---------- + spec : dict[str, Any] + Storage configuration dictionary containing: + + - ``protocol``: Storage protocol (``'file'``, ``'s3'``, ``'gcs'``, ``'azure'``) + - ``location``: Base path or bucket prefix + - ``bucket``: Bucket name (for cloud storage) + - ``endpoint``: Endpoint URL (for S3-compatible storage) + - ``access_key``: Access key (for cloud storage) + - ``secret_key``: Secret key (for cloud storage) + - ``secure``: Use HTTPS (default True for cloud) """ self.spec = spec self.protocol = spec.get("protocol", "file") @@ -281,11 +328,15 @@ def _full_path(self, path: str | PurePosixPath) -> str: """ Construct full path including location/bucket prefix. - Args: - path: Relative path within the storage location + Parameters + ---------- + path : str or PurePosixPath + Relative path within the storage location. - Returns: - Full path suitable for fsspec operations + Returns + ------- + str + Full path suitable for fsspec operations. """ path = str(path) if self.protocol == "s3": @@ -307,14 +358,18 @@ def _full_path(self, path: str | PurePosixPath) -> str: return str(Path(location) / path) return path - def put_file(self, local_path: str | Path, remote_path: str | PurePosixPath, metadata: dict | None = None): + def put_file(self, local_path: str | Path, remote_path: str | PurePosixPath, metadata: dict | None = None) -> None: """ Upload a file from local filesystem to storage. - Args: - local_path: Path to local file - remote_path: Destination path in storage - metadata: Optional metadata to attach to the file + Parameters + ---------- + local_path : str or Path + Path to local file. + remote_path : str or PurePosixPath + Destination path in storage. + metadata : dict, optional + Metadata to attach to the file (cloud storage only). """ full_path = self._full_path(remote_path) logger.debug(f"put_file: {local_path} -> {self.protocol}:{full_path}") @@ -329,13 +384,16 @@ def put_file(self, local_path: str | Path, remote_path: str | PurePosixPath, met # For cloud storage, use fsspec put self.fs.put_file(str(local_path), full_path) - def get_file(self, remote_path: str | PurePosixPath, local_path: str | Path): + def get_file(self, remote_path: str | PurePosixPath, local_path: str | Path) -> None: """ Download a file from storage to local filesystem. - Args: - remote_path: Path in storage - local_path: Destination path on local filesystem + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. + local_path : str or Path + Destination path on local filesystem. """ full_path = self._full_path(remote_path) logger.debug(f"get_file: {self.protocol}:{full_path} -> {local_path}") @@ -350,13 +408,16 @@ def get_file(self, remote_path: str | PurePosixPath, local_path: str | Path): else: self.fs.get_file(full_path, str(local_path)) - def put_buffer(self, buffer: bytes, remote_path: str | PurePosixPath): + def put_buffer(self, buffer: bytes, remote_path: str | PurePosixPath) -> None: """ Write bytes to storage. 
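# Minimal local-filesystem backend built from the spec keys listed above;
# the location and file paths are hypothetical.
from datajoint.storage import StorageBackend

backend = StorageBackend({"protocol": "file", "location": "/data/dj-store"})
backend.put_file("results/traces.npy", "lab/recording/traces.npy")
backend.get_file("lab/recording/traces.npy", "downloads/traces.npy")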
- Args: - buffer: Bytes to write - remote_path: Destination path in storage + Parameters + ---------- + buffer : bytes + Bytes to write. + remote_path : str or PurePosixPath + Destination path in storage. """ full_path = self._full_path(remote_path) logger.debug(f"put_buffer: {len(buffer)} bytes -> {self.protocol}:{full_path}") @@ -373,11 +434,20 @@ def get_buffer(self, remote_path: str | PurePosixPath) -> bytes: """ Read bytes from storage. - Args: - remote_path: Path in storage + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. + + Returns + ------- + bytes + File contents. - Returns: - File contents as bytes + Raises + ------ + MissingExternalFile + If the file does not exist. """ full_path = self._full_path(remote_path) logger.debug(f"get_buffer: {self.protocol}:{full_path}") @@ -394,11 +464,15 @@ def exists(self, remote_path: str | PurePosixPath) -> bool: """ Check if a file exists in storage. - Args: - remote_path: Path in storage + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. - Returns: - True if file exists + Returns + ------- + bool + True if file exists. """ full_path = self._full_path(remote_path) logger.debug(f"exists: {self.protocol}:{full_path}") @@ -408,12 +482,14 @@ def exists(self, remote_path: str | PurePosixPath) -> bool: else: return self.fs.exists(full_path) - def remove(self, remote_path: str | PurePosixPath): + def remove(self, remote_path: str | PurePosixPath) -> None: """ Remove a file from storage. - Args: - remote_path: Path in storage + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. """ full_path = self._full_path(remote_path) logger.debug(f"remove: {self.protocol}:{full_path}") @@ -430,11 +506,15 @@ def size(self, remote_path: str | PurePosixPath) -> int: """ Get file size in bytes. - Args: - remote_path: Path in storage + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. - Returns: - File size in bytes + Returns + ------- + int + File size in bytes. """ full_path = self._full_path(remote_path) @@ -447,12 +527,17 @@ def open(self, remote_path: str | PurePosixPath, mode: str = "rb"): """ Open a file in storage. - Args: - remote_path: Path in storage - mode: File mode ('rb', 'wb', etc.) + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. + mode : str, optional + File mode (``'rb'``, ``'wb'``, etc.). Default ``'rb'``. - Returns: - File-like object + Returns + ------- + file-like + File-like object for reading or writing. """ full_path = self._full_path(remote_path) @@ -466,12 +551,18 @@ def put_folder(self, local_path: str | Path, remote_path: str | PurePosixPath) - """ Upload a folder to storage. - Args: - local_path: Path to local folder - remote_path: Destination path in storage + Parameters + ---------- + local_path : str or Path + Path to local folder. + remote_path : str or PurePosixPath + Destination path in storage. - Returns: - Manifest dict with file list, total_size, and item_count + Returns + ------- + dict + Manifest with keys ``'files'``, ``'total_size'``, ``'item_count'``, + ``'created'``. """ local_path = Path(local_path) if not local_path.is_dir(): @@ -524,12 +615,14 @@ def put_folder(self, local_path: str | Path, remote_path: str | PurePosixPath) - return manifest - def remove_folder(self, remote_path: str | PurePosixPath): + def remove_folder(self, remote_path: str | PurePosixPath) -> None: """ Remove a folder and its manifest from storage. 
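# Byte-level round trip through the API above, using a hypothetical local store.
from datajoint.storage import StorageBackend

backend = StorageBackend({"protocol": "file", "location": "/data/dj-store"})
payload = b"hello datajoint"
backend.put_buffer(payload, "scratch/hello.bin")
assert backend.exists("scratch/hello.bin")
assert backend.get_buffer("scratch/hello.bin") == payload
assert backend.size("scratch/hello.bin") == len(payload)
backend.remove("scratch/hello.bin")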
- Args: - remote_path: Path to folder in storage + Parameters + ---------- + remote_path : str or PurePosixPath + Path to folder in storage. """ full_path = self._full_path(remote_path) logger.debug(f"remove_folder: {self.protocol}:{full_path}") @@ -552,11 +645,15 @@ def get_fsmap(self, remote_path: str | PurePosixPath) -> fsspec.FSMap: """ Get an FSMap for a path (useful for Zarr/xarray). - Args: - remote_path: Path in storage + Parameters + ---------- + remote_path : str or PurePosixPath + Path in storage. - Returns: - fsspec.FSMap instance + Returns + ------- + fsspec.FSMap + Mapping interface for the storage path. """ full_path = self._full_path(remote_path) return fsspec.FSMap(full_path, self.fs) @@ -565,12 +662,17 @@ def copy_from_url(self, source_url: str, dest_path: str | PurePosixPath) -> int: """ Copy a file from a remote URL to managed storage. - Args: - source_url: Remote URL (s3://, gs://, http://, etc.) - dest_path: Destination path in managed storage + Parameters + ---------- + source_url : str + Remote URL (``s3://``, ``gs://``, ``http://``, etc.). + dest_path : str or PurePosixPath + Destination path in managed storage. - Returns: - Size of copied file in bytes + Returns + ------- + int + Size of copied file in bytes. """ protocol, source_path = parse_remote_url(source_url) full_dest = self._full_path(dest_path) @@ -603,13 +705,20 @@ def _copy_folder_from_url( """ Copy a folder from a remote URL to managed storage. - Args: - source_fs: Source filesystem - source_path: Path in source filesystem - dest_path: Destination path in managed storage - - Returns: - Manifest dict with file list, total_size, and item_count + Parameters + ---------- + source_fs : fsspec.AbstractFileSystem + Source filesystem. + source_path : str + Path in source filesystem. + dest_path : str or PurePosixPath + Destination path in managed storage. + + Returns + ------- + dict + Manifest with keys ``'files'``, ``'total_size'``, ``'item_count'``, + ``'created'``. """ full_dest = self._full_path(dest_path) logger.debug(f"copy_folder_from_url: {source_path} -> {self.protocol}:{full_dest}") @@ -655,11 +764,15 @@ def source_is_directory(self, source: str) -> bool: """ Check if a source path (local or remote URL) is a directory. - Args: - source: Local path or remote URL + Parameters + ---------- + source : str + Local path or remote URL. - Returns: - True if source is a directory + Returns + ------- + bool + True if source is a directory. """ if is_remote_url(source): protocol, path = parse_remote_url(source) @@ -672,11 +785,15 @@ def source_exists(self, source: str) -> bool: """ Check if a source path (local or remote URL) exists. - Args: - source: Local path or remote URL + Parameters + ---------- + source : str + Local path or remote URL. - Returns: - True if source exists + Returns + ------- + bool + True if source exists. """ if is_remote_url(source): protocol, path = parse_remote_url(source) @@ -689,11 +806,15 @@ def get_source_size(self, source: str) -> int | None: """ Get the size of a source file (local or remote URL). - Args: - source: Local path or remote URL + Parameters + ---------- + source : str + Local path or remote URL. - Returns: - Size in bytes, or None if directory or cannot determine + Returns + ------- + int or None + Size in bytes, or None if directory or cannot determine. """ try: if is_remote_url(source): @@ -718,11 +839,15 @@ def get_storage_backend(spec: dict[str, Any]) -> StorageBackend: """ Factory function to create a storage backend from configuration. 
- Args: - spec: Storage configuration dictionary + Parameters + ---------- + spec : dict[str, Any] + Storage configuration dictionary. - Returns: - StorageBackend instance + Returns + ------- + StorageBackend + Configured storage backend instance. """ return StorageBackend(spec) @@ -731,18 +856,25 @@ def verify_or_create_store_metadata(backend: StorageBackend, spec: dict[str, Any """ Verify or create the store metadata file at the storage root. - On first use, creates the datajoint_store.json file with project info. - On subsequent uses, verifies the project_name matches. - - Args: - backend: StorageBackend instance - spec: Object storage configuration spec - - Returns: - Store metadata dict - - Raises: - DataJointError: If project_name mismatch detected + On first use, creates the ``datajoint_store.json`` file with project info. + On subsequent uses, verifies the ``project_name`` matches. + + Parameters + ---------- + backend : StorageBackend + Storage backend instance. + spec : dict[str, Any] + Object storage configuration spec. + + Returns + ------- + dict + Store metadata dictionary. + + Raises + ------ + DataJointError + If ``project_name`` mismatch detected. """ from .version import __version__ as dj_version diff --git a/src/datajoint/table.py b/src/datajoint/table.py index d7c9fa724..fb1615a11 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -20,7 +20,6 @@ if TYPE_CHECKING: from collections.abc import Iterable, Mapping - from .connection import Connection from .expression import QueryExpression from .declare import alter, declare from .errors import ( From 8dbe50f4eb7b2cdc1a57e76b41e21819e0b1937c Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 12:38:50 -0600 Subject: [PATCH 03/15] docs: Continue docstring harmonization to NumPy style (batch 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harmonized modules: - settings.py - Configuration system (pydantic-settings) - errors.py - Exception hierarchy - builtin_codecs.py - Built-in codec implementations All modules now use: - NumPy-style docstrings (Parameters, Returns, Raises, etc.) - `from __future__ import annotations` for deferred evaluation - Consistent single-line docstrings for simple classes/methods πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/builtin_codecs.py | 153 +++++++++++++-------- src/datajoint/errors.py | 69 ++++------ src/datajoint/settings.py | 229 ++++++++++++++++++++------------ 3 files changed, 268 insertions(+), 183 deletions(-) diff --git a/src/datajoint/builtin_codecs.py b/src/datajoint/builtin_codecs.py index a55494e82..66589dc36 100644 --- a/src/datajoint/builtin_codecs.py +++ b/src/datajoint/builtin_codecs.py @@ -167,13 +167,19 @@ def encode(self, value: bytes, *, key: dict | None = None, store_name: str | Non """ Store content and return metadata. - Args: - value: Raw bytes to store. - key: Primary key values (unused). - store_name: Store to use. If None, uses default store. - - Returns: - Metadata dict: {hash, store, size} + Parameters + ---------- + value : bytes + Raw bytes to store. + key : dict, optional + Primary key values (unused). + store_name : str, optional + Store to use. If None, uses default store. + + Returns + ------- + dict + Metadata dict: ``{hash, store, size}``. """ from .content_registry import put_content @@ -183,11 +189,16 @@ def decode(self, stored: dict, *, key: dict | None = None) -> bytes: """ Retrieve content by hash. 
- Args: - stored: Metadata dict with 'hash' and optionally 'store'. - key: Primary key values (unused). + Parameters + ---------- + stored : dict + Metadata dict with ``'hash'`` and optionally ``'store'``. + key : dict, optional + Primary key values (unused). - Returns: + Returns + ------- + bytes Original bytes. """ from .content_registry import get_content @@ -275,19 +286,20 @@ def encode( """ Store content and return metadata. - Args: - value: Content to store. Can be: - - bytes: Raw bytes to store as file - - str/Path: Path to local file or folder to upload - key: Dict containing context for path construction: - - _schema: Schema name - - _table: Table name - - _field: Field/attribute name - - Other entries are primary key values - store_name: Store to use. If None, uses default store. - - Returns: - Metadata dict suitable for ObjectRef.from_json() + Parameters + ---------- + value : bytes, str, or Path + Content to store: bytes (raw data), or str/Path (file/folder to upload). + key : dict, optional + Context for path construction with keys ``_schema``, ``_table``, + ``_field``, plus primary key values. + store_name : str, optional + Store to use. If None, uses default store. + + Returns + ------- + dict + Metadata dict suitable for ``ObjectRef.from_json()``. """ from datetime import datetime, timezone from pathlib import Path @@ -381,12 +393,17 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: """ Create ObjectRef handle for lazy access. - Args: - stored: Metadata dict from database. - key: Primary key values (unused). - - Returns: - ObjectRef for accessing the stored content. + Parameters + ---------- + stored : dict + Metadata dict from database. + key : dict, optional + Primary key values (unused). + + Returns + ------- + ObjectRef + Handle for accessing the stored content. """ from .objectref import ObjectRef from .content_registry import get_store_backend @@ -396,7 +413,7 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: return ObjectRef.from_json(stored, backend=backend) def validate(self, value: Any) -> None: - """Validate that value is bytes, path, dict metadata, or (extension, data) tuple.""" + """Validate value is bytes, path, dict metadata, or (ext, data) tuple.""" from pathlib import Path if isinstance(value, bytes): @@ -463,13 +480,19 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None """ Read file and encode as filename + contents. - Args: - value: Path to file (str or Path). - key: Primary key values (unused). - store_name: Unused for internal storage. - - Returns: - Bytes: filename (UTF-8) + null byte + file contents + Parameters + ---------- + value : str or Path + Path to file. + key : dict, optional + Primary key values (unused). + store_name : str, optional + Unused for internal storage. + + Returns + ------- + bytes + Filename (UTF-8) + null byte + file contents. """ from pathlib import Path @@ -487,12 +510,17 @@ def decode(self, stored: bytes, *, key: dict | None = None) -> str: """ Extract file to download path and return local path. - Args: - stored: Blob containing filename + null + contents. - key: Primary key values (unused). - - Returns: - Path to extracted file as string. + Parameters + ---------- + stored : bytes + Blob containing filename + null + contents. + key : dict, optional + Primary key values (unused). + + Returns + ------- + str + Path to extracted file. 
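# The attachment blob layout described above is simply: filename bytes, a null
# separator, then the file contents. A hand-rolled version for illustration
# only; the codec itself performs this encoding.
from pathlib import Path

def encode_attachment(path: str) -> bytes:
    p = Path(path)
    return p.name.encode() + b"\x00" + p.read_bytes()

def decode_attachment(blob: bytes, download_dir: str = ".") -> str:
    name, _, contents = blob.partition(b"\x00")
    out = Path(download_dir) / name.decode()
    out.write_bytes(contents)
    return str(out)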
""" from pathlib import Path @@ -592,13 +620,19 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None """ Store path reference as JSON metadata. - Args: - value: Relative path within the store (str). - key: Primary key values (unused). - store_name: Store where the file exists. - - Returns: - Metadata dict: {path, store} + Parameters + ---------- + value : str + Relative path within the store. + key : dict, optional + Primary key values (unused). + store_name : str, optional + Store where the file exists. + + Returns + ------- + dict + Metadata dict: ``{path, store}``. """ from datetime import datetime, timezone @@ -629,12 +663,17 @@ def decode(self, stored: dict, *, key: dict | None = None) -> Any: """ Create ObjectRef handle for lazy access. - Args: - stored: Metadata dict with path and store. - key: Primary key values (unused). - - Returns: - ObjectRef for accessing the file. + Parameters + ---------- + stored : dict + Metadata dict with path and store. + key : dict, optional + Primary key values (unused). + + Returns + ------- + ObjectRef + Handle for accessing the file. """ from .objectref import ObjectRef from .content_registry import get_store_backend diff --git a/src/datajoint/errors.py b/src/datajoint/errors.py index aadc74caf..d2a789692 100644 --- a/src/datajoint/errors.py +++ b/src/datajoint/errors.py @@ -1,87 +1,74 @@ """ -Exception classes for the DataJoint library +Exception classes for the DataJoint library. + +This module defines the exception hierarchy for DataJoint errors. """ +from __future__ import annotations + # --- Top Level --- class DataJointError(Exception): - """ - Base class for errors specific to DataJoint internal operation. - """ + """Base class for errors specific to DataJoint internal operation.""" - def suggest(self, *args): + def suggest(self, *args) -> "DataJointError": """ - regenerate the exception with additional arguments + Regenerate the exception with additional arguments. + + Parameters + ---------- + *args : any + Additional arguments to append to the exception. - :param args: addition arguments - :return: a new exception of the same type with the additional arguments + Returns + ------- + DataJointError + A new exception of the same type with the additional arguments. """ return self.__class__(*(self.args + args)) # --- Second Level --- class LostConnectionError(DataJointError): - """ - Loss of server connection - """ + """Loss of server connection.""" class QueryError(DataJointError): - """ - Errors arising from queries to the database - """ + """Errors arising from queries to the database.""" # --- Third Level: QueryErrors --- class QuerySyntaxError(QueryError): - """ - Errors arising from incorrect query syntax - """ + """Errors arising from incorrect query syntax.""" class AccessError(QueryError): - """ - User access error: insufficient privileges. 
- """ + """User access error: insufficient privileges.""" class MissingTableError(DataJointError): - """ - Query on a table that has not been declared - """ + """Query on a table that has not been declared.""" class DuplicateError(QueryError): - """ - An integrity error caused by a duplicate entry into a unique key - """ + """Integrity error caused by a duplicate entry into a unique key.""" class IntegrityError(QueryError): - """ - An integrity error triggered by foreign key constraints - """ + """Integrity error triggered by foreign key constraints.""" class UnknownAttributeError(QueryError): - """ - User requests an attribute name not found in query heading - """ + """User requests an attribute name not found in query heading.""" class MissingAttributeError(QueryError): - """ - An error arising when a required attribute value is not provided in INSERT - """ + """Required attribute value not provided in INSERT.""" class MissingExternalFile(DataJointError): - """ - Error raised when an external file managed by DataJoint is no longer accessible - """ + """External file managed by DataJoint is no longer accessible.""" class BucketInaccessible(DataJointError): - """ - Error raised when a S3 bucket is inaccessible - """ + """S3 bucket is inaccessible.""" diff --git a/src/datajoint/settings.py b/src/datajoint/settings.py index 22912d223..1c43b1ed2 100644 --- a/src/datajoint/settings.py +++ b/src/datajoint/settings.py @@ -1,21 +1,26 @@ """ -DataJoint Settings using pydantic-settings. +DataJoint configuration system using pydantic-settings. -This module provides a strongly-typed configuration system for DataJoint. +This module provides strongly-typed configuration with automatic loading +from environment variables, secrets directories, and JSON config files. Configuration sources (in priority order): -1. Environment variables (DJ_*) -2. Secrets directories (.secrets/ in project, /run/secrets/datajoint/) -3. Project config file (datajoint.json, searched recursively up to .git/.hg) - -Example usage: - >>> import datajoint as dj - >>> dj.config.database.host - 'localhost' - >>> with dj.config.override(safemode=False): - ... # dangerous operations here - -Project structure: + +1. Environment variables (``DJ_*``) +2. Secrets directories (``.secrets/`` in project, ``/run/secrets/datajoint/``) +3. Project config file (``datajoint.json``, searched recursively up to ``.git/.hg``) + +Examples +-------- +>>> import datajoint as dj +>>> dj.config.database.host +'localhost' +>>> with dj.config.override(safemode=False): +... # dangerous operations here +... pass + +Project structure:: + myproject/ β”œβ”€β”€ .git/ β”œβ”€β”€ datajoint.json # Project config (commit this) @@ -26,6 +31,8 @@ └── analysis.py # Config found via parent search """ +from __future__ import annotations + import json import logging import os @@ -75,13 +82,17 @@ def find_config_file(start: Path | None = None) -> Path | None: """ Search for datajoint.json in current and parent directories. - Searches upward from `start` (default: cwd) until finding the config file - or hitting a project boundary (.git, .hg) or filesystem root. + Searches upward from ``start`` until finding the config file or hitting + a project boundary (``.git``, ``.hg``) or filesystem root. - Args: - start: Directory to start search from. Defaults to current working directory. + Parameters + ---------- + start : Path, optional + Directory to start search from. Defaults to current working directory. 
- Returns: + Returns + ------- + Path or None Path to config file if found, None otherwise. """ current = (start or Path.cwd()).resolve() @@ -107,13 +118,18 @@ def find_secrets_dir(config_path: Path | None = None) -> Path | None: Find the secrets directory. Priority: - 1. .secrets/ in same directory as datajoint.json (project secrets) - 2. /run/secrets/datajoint/ (Docker/Kubernetes secrets) - Args: - config_path: Path to datajoint.json if found. + 1. ``.secrets/`` in same directory as datajoint.json (project secrets) + 2. ``/run/secrets/datajoint/`` (Docker/Kubernetes secrets) + + Parameters + ---------- + config_path : Path, optional + Path to datajoint.json if found. - Returns: + Returns + ------- + Path or None Path to secrets directory if found, None otherwise. """ # Check project secrets directory (next to config file) @@ -133,11 +149,16 @@ def read_secret_file(secrets_dir: Path | None, name: str) -> str | None: """ Read a secret value from a file in the secrets directory. - Args: - secrets_dir: Path to secrets directory. - name: Name of the secret file (e.g., 'database.password'). + Parameters + ---------- + secrets_dir : Path or None + Path to secrets directory. + name : str + Name of the secret file (e.g., ``'database.password'``). - Returns: + Returns + ------- + str or None Secret value as string, or None if not found. """ if secrets_dir is None: @@ -268,18 +289,23 @@ class Config(BaseSettings): Main DataJoint configuration. Settings are loaded from (in priority order): - 1. Environment variables (DJ_*) - 2. Secrets directory (.secrets/ or /run/secrets/datajoint/) - 3. Config file (datajoint.json, searched in parent directories) + + 1. Environment variables (``DJ_*``) + 2. Secrets directory (``.secrets/`` or ``/run/secrets/datajoint/``) + 3. Config file (``datajoint.json``, searched in parent directories) 4. Default values + Examples + -------- Access settings via attributes: - >>> config.database.host - >>> config.safemode + + >>> config.database.host + >>> config.safemode Override temporarily with context manager: - >>> with config.override(safemode=False): - ... pass + + >>> with config.override(safemode=False): + ... pass """ model_config = SettingsConfigDict( @@ -336,14 +362,20 @@ def get_store_spec(self, store: str) -> dict[str, Any]: """ Get configuration for an external store. - Args: - store: Name of the store to retrieve + Parameters + ---------- + store : str + Name of the store to retrieve. - Returns: - Store configuration dict with validated fields + Returns + ------- + dict[str, Any] + Store configuration dict with validated fields. - Raises: - DataJointError: If store is not configured or has invalid config + Raises + ------ + DataJointError + If store is not configured or has invalid config. """ if store not in self.stores: raise DataJointError(f"Storage '{store}' is requested but not configured") @@ -418,11 +450,15 @@ def get_object_storage_spec(self) -> dict[str, Any]: """ Get validated object storage configuration. - Returns: - Object storage configuration dict + Returns + ------- + dict[str, Any] + Object storage configuration dict. - Raises: - DataJointError: If object storage is not configured or has invalid config + Raises + ------ + DataJointError + If object storage is not configured or has invalid config. """ os_settings = self.object_storage @@ -485,14 +521,20 @@ def get_object_store_spec(self, store_name: str | None = None) -> dict[str, Any] """ Get validated configuration for a specific object store. 
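# Sketch of the secrets lookup described above: credentials live in files named
# after the setting, next to datajoint.json or under /run/secrets/datajoint/.
# The project layout is hypothetical.
from datajoint.settings import find_config_file, find_secrets_dir, read_secret_file

config_path = find_config_file()                   # walks up until .git/.hg or root
secrets_dir = find_secrets_dir(config_path)        # .secrets/ beside the config file
password = read_secret_file(secrets_dir, "database.password")   # None if absent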
- Args: - store_name: Name of the store (None for default store) + Parameters + ---------- + store_name : str, optional + Name of the store. None for default store. - Returns: - Object store configuration dict + Returns + ------- + dict[str, Any] + Object store configuration dict. - Raises: - DataJointError: If store is not configured or has invalid config + Raises + ------ + DataJointError + If store is not configured or has invalid config. """ if store_name is None: # Return default store spec @@ -567,8 +609,10 @@ def load(self, filename: str | Path) -> None: """ Load settings from a JSON file. - Args: - filename: Path to load configuration from. + Parameters + ---------- + filename : str or Path + Path to load configuration from. """ filepath = Path(filename) if not filepath.exists(): @@ -659,18 +703,23 @@ def override(self, **kwargs: Any) -> Iterator["Config"]: """ Temporarily override configuration values. - Args: - **kwargs: Settings to override. Use double underscore for nested - settings (e.g., database__host="localhost") - - Yields: - The config instance with overridden values - - Example: - >>> with config.override(safemode=False, database__host="test"): - ... # config.safemode is False here - ... pass - >>> # config.safemode is restored + Parameters + ---------- + **kwargs : Any + Settings to override. Use double underscore for nested settings + (e.g., ``database__host="localhost"``). + + Yields + ------ + Config + The config instance with overridden values. + + Examples + -------- + >>> with config.override(safemode=False, database__host="test"): + ... # config.safemode is False here + ... pass + >>> # config.safemode is restored """ # Store original values backup = {} @@ -721,26 +770,36 @@ def save_template( Create a template datajoint.json configuration file. Credentials should NOT be stored in datajoint.json. Instead, use either: - - Environment variables (DJ_USER, DJ_PASS, DJ_HOST, etc.) - - The .secrets/ directory (created alongside datajoint.json) - - Args: - path: Where to save the template. Defaults to 'datajoint.json' in current directory. - minimal: If True (default), create a minimal template with just database settings. - If False, create a full template with all available settings. - create_secrets_dir: If True (default), also create a .secrets/ directory - with template files for credentials. - - Returns: - Path to the created config file. - - Raises: - FileExistsError: If config file already exists (won't overwrite). - - Example: - >>> import datajoint as dj - >>> dj.config.save_template() # Creates minimal template + .secrets/ - >>> dj.config.save_template("full-config.json", minimal=False) + + - Environment variables (``DJ_USER``, ``DJ_PASS``, ``DJ_HOST``, etc.) + - The ``.secrets/`` directory (created alongside datajoint.json) + + Parameters + ---------- + path : str or Path, optional + Where to save the template. Default ``'datajoint.json'``. + minimal : bool, optional + If True (default), create minimal template with just database settings. + If False, create full template with all available settings. + create_secrets_dir : bool, optional + If True (default), also create a ``.secrets/`` directory with + template files for credentials. + + Returns + ------- + Path + Absolute path to the created config file. + + Raises + ------ + FileExistsError + If config file already exists (won't overwrite). 
+ + Examples + -------- + >>> import datajoint as dj + >>> dj.config.save_template() # Creates minimal template + .secrets/ + >>> dj.config.save_template("full-config.json", minimal=False) """ filepath = Path(path) if filepath.exists(): From 2f3e88b05c5904726d192948461176675dd92c5d Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 13:18:03 -0600 Subject: [PATCH 04/15] docs: Continue docstring harmonization to NumPy style (batch 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Harmonized modules: - gc.py - Garbage collection for external storage - migrate.py - Schema migration utilities - lineage.py - Attribute lineage management - objectref.py - ObjectRef class for fetched objects - content_registry.py - Content-addressed storage - admin.py - Database connection management Converted from Google/Sphinx style to NumPy style docstrings. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/admin.py | 58 +++++--- src/datajoint/content_registry.py | 134 ++++++++++++------ src/datajoint/gc.py | 184 ++++++++++++++++-------- src/datajoint/lineage.py | 108 ++++++++++---- src/datajoint/migrate.py | 228 +++++++++++++++++------------- src/datajoint/objectref.py | 136 +++++++++++------- 6 files changed, 550 insertions(+), 298 deletions(-) diff --git a/src/datajoint/admin.py b/src/datajoint/admin.py index 64a91bb48..275e9a823 100644 --- a/src/datajoint/admin.py +++ b/src/datajoint/admin.py @@ -9,18 +9,24 @@ def kill(restriction=None, connection=None, order_by=None): """ - view and kill database connections. - - :param restriction: restriction to be applied to processlist - :param connection: a datajoint.Connection object. Default calls datajoint.conn() - :param order_by: order by a single attribute or the list of attributes. defaults to 'id'. - - Restrictions are specified as strings and can involve any of the attributes of - information_schema.processlist: ID, USER, HOST, DB, COMMAND, TIME, STATE, INFO. - - Examples: - dj.kill('HOST LIKE "%compute%"') lists only connections from hosts containing "compute". - dj.kill('TIME > 600') lists only connections in their current state for more than 10 minutes + View and kill database connections interactively. + + Displays a list of active connections and prompts for connections to kill. + + Parameters + ---------- + restriction : str, optional + SQL WHERE clause to filter connections. Can use any attribute from + information_schema.processlist: ID, USER, HOST, DB, COMMAND, TIME, STATE, INFO. + connection : Connection, optional + A datajoint.Connection object. Defaults to datajoint.conn(). + order_by : str or list[str], optional + Attribute(s) to order results by. Defaults to 'id'. + + Examples + -------- + >>> dj.kill('HOST LIKE "%compute%"') # List connections from hosts containing "compute" + >>> dj.kill('TIME > 600') # List connections idle for more than 10 minutes """ if connection is None: @@ -61,16 +67,24 @@ def kill(restriction=None, connection=None, order_by=None): def kill_quick(restriction=None, connection=None): """ - Kill database connections without prompting. Returns number of terminated connections. - - :param restriction: restriction to be applied to processlist - :param connection: a datajoint.Connection object. Default calls datajoint.conn() - - Restrictions are specified as strings and can involve any of the attributes of - information_schema.processlist: ID, USER, HOST, DB, COMMAND, TIME, STATE, INFO. 
- - Examples: - dj.kill('HOST LIKE "%compute%"') terminates connections from hosts containing "compute". + Kill database connections without prompting. + + Parameters + ---------- + restriction : str, optional + SQL WHERE clause to filter connections. Can use any attribute from + information_schema.processlist: ID, USER, HOST, DB, COMMAND, TIME, STATE, INFO. + connection : Connection, optional + A datajoint.Connection object. Defaults to datajoint.conn(). + + Returns + ------- + int + Number of terminated connections. + + Examples + -------- + >>> dj.kill_quick('HOST LIKE "%compute%"') # Kill connections from hosts with "compute" """ if connection is None: connection = conn() diff --git a/src/datajoint/content_registry.py b/src/datajoint/content_registry.py index abed955a0..f5da65ff5 100644 --- a/src/datajoint/content_registry.py +++ b/src/datajoint/content_registry.py @@ -23,11 +23,15 @@ def compute_content_hash(data: bytes) -> str: """ Compute SHA256 hash of content. - Args: - data: Content bytes + Parameters + ---------- + data : bytes + Content bytes. - Returns: - Hex-encoded SHA256 hash (64 characters) + Returns + ------- + str + Hex-encoded SHA256 hash (64 characters). """ return hashlib.sha256(data).hexdigest() @@ -39,11 +43,15 @@ def build_content_path(content_hash: str) -> str: Content is stored in a hierarchical structure to avoid too many files in a single directory: _content/{hash[:2]}/{hash[2:4]}/{hash} - Args: - content_hash: SHA256 hex hash (64 characters) + Parameters + ---------- + content_hash : str + SHA256 hex hash (64 characters). - Returns: - Relative path within the store + Returns + ------- + str + Relative path within the store. """ if len(content_hash) != 64: raise DataJointError(f"Invalid content hash length: {len(content_hash)} (expected 64)") @@ -54,12 +62,16 @@ def get_store_backend(store_name: str | None = None) -> StorageBackend: """ Get a StorageBackend for content storage. - Args: - store_name: Name of the store to use. If None, uses the default object storage - configuration or the configured default_store. + Parameters + ---------- + store_name : str, optional + Name of the store to use. If None, uses the default object storage + configuration or the configured default_store. - Returns: - StorageBackend instance + Returns + ------- + StorageBackend + StorageBackend instance. """ # If store_name is None, check for configured default_store if store_name is None and config.object_storage.default_store: @@ -77,12 +89,17 @@ def put_content(data: bytes, store_name: str | None = None) -> dict[str, Any]: If the content already exists (same hash), it is not re-uploaded. Returns metadata including the hash, store, and size. - Args: - data: Content bytes to store - store_name: Name of the store. If None, uses default store. + Parameters + ---------- + data : bytes + Content bytes to store. + store_name : str, optional + Name of the store. If None, uses default store. - Returns: - Metadata dict with keys: hash, store, size + Returns + ------- + dict[str, Any] + Metadata dict with keys: hash, store, size. """ content_hash = compute_content_hash(data) path = build_content_path(content_hash) @@ -107,16 +124,24 @@ def get_content(content_hash: str, store_name: str | None = None) -> bytes: """ Retrieve content by its hash. - Args: - content_hash: SHA256 hex hash of the content - store_name: Name of the store. If None, uses default store. 
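The hashing and path layout described above are compact enough to show end to end. The following standalone sketch mirrors the documented behavior of ``compute_content_hash`` and ``build_content_path`` (a SHA256 hex digest stored under ``_content/{hash[:2]}/{hash[2:4]}/{hash}``); it is an illustration, not the module's code.

```python
# Standalone sketch of the content-addressing scheme documented above.
import hashlib


def compute_content_hash(data: bytes) -> str:
    """Return the 64-character SHA256 hex digest of the content."""
    return hashlib.sha256(data).hexdigest()


def build_content_path(content_hash: str) -> str:
    """Return the hierarchical store path _content/{hash[:2]}/{hash[2:4]}/{hash}."""
    if len(content_hash) != 64:
        raise ValueError(f"Invalid content hash length: {len(content_hash)} (expected 64)")
    return f"_content/{content_hash[:2]}/{content_hash[2:4]}/{content_hash}"


h = compute_content_hash(b"hello")
print(build_content_path(h))
# _content/2c/f2/2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824
```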
- - Returns: - Content bytes - - Raises: - MissingExternalFile: If content is not found - DataJointError: If hash verification fails + Parameters + ---------- + content_hash : str + SHA256 hex hash of the content. + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + bytes + Content bytes. + + Raises + ------ + MissingExternalFile + If content is not found. + DataJointError + If hash verification fails. """ path = build_content_path(content_hash) backend = get_store_backend(store_name) @@ -135,12 +160,17 @@ def content_exists(content_hash: str, store_name: str | None = None) -> bool: """ Check if content exists in storage. - Args: - content_hash: SHA256 hex hash of the content - store_name: Name of the store. If None, uses default store. + Parameters + ---------- + content_hash : str + SHA256 hex hash of the content. + store_name : str, optional + Name of the store. If None, uses default store. - Returns: - True if content exists + Returns + ------- + bool + True if content exists. """ path = build_content_path(content_hash) backend = get_store_backend(store_name) @@ -151,15 +181,24 @@ def delete_content(content_hash: str, store_name: str | None = None) -> bool: """ Delete content from storage. - WARNING: This should only be called after verifying no references exist. + This should only be called after verifying no references exist. Use garbage collection to safely remove unreferenced content. - Args: - content_hash: SHA256 hex hash of the content - store_name: Name of the store. If None, uses default store. + Parameters + ---------- + content_hash : str + SHA256 hex hash of the content. + store_name : str, optional + Name of the store. If None, uses default store. - Returns: - True if content was deleted, False if it didn't exist + Returns + ------- + bool + True if content was deleted, False if it didn't exist. + + Warnings + -------- + This permanently deletes content. Ensure no references exist first. """ path = build_content_path(content_hash) backend = get_store_backend(store_name) @@ -175,12 +214,17 @@ def get_content_size(content_hash: str, store_name: str | None = None) -> int: """ Get the size of stored content. - Args: - content_hash: SHA256 hex hash of the content - store_name: Name of the store. If None, uses default store. - - Returns: - Size in bytes + Parameters + ---------- + content_hash : str + SHA256 hex hash of the content. + store_name : str, optional + Name of the store. If None, uses default store. + + Returns + ------- + int + Size in bytes. """ path = build_content_path(content_hash) backend = get_store_backend(store_name) diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index 1ab08789e..33ede63d2 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -46,11 +46,15 @@ def _uses_content_storage(attr) -> bool: - (chains to ) - (chains to ) - Args: - attr: Attribute from table heading - - Returns: - True if the attribute stores content hashes + Parameters + ---------- + attr : Attribute + Attribute from table heading. + + Returns + ------- + bool + True if the attribute stores content hashes. """ if not attr.codec: return False @@ -74,11 +78,15 @@ def _uses_object_storage(attr) -> bool: """ Check if an attribute uses path-addressed object storage. - Args: - attr: Attribute from table heading + Parameters + ---------- + attr : Attribute + Attribute from table heading. - Returns: - True if the attribute stores object paths + Returns + ------- + bool + True if the attribute stores object paths. 
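For orientation, a usage sketch of the registry round trip described above. It assumes a configured default object store; the ``datajoint.content_registry`` import path follows the module location, and the payload is a placeholder.

```python
# Illustrative round trip through the content registry (requires a configured default store).
from datajoint.content_registry import content_exists, get_content, put_content

meta = put_content(b"example payload")   # {"hash": ..., "store": ..., "size": ...}; deduplicated by hash
assert content_exists(meta["hash"])
assert get_content(meta["hash"]) == b"example payload"
```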
""" if not attr.codec: return False @@ -91,11 +99,15 @@ def _extract_content_refs(value: Any) -> list[tuple[str, str | None]]: """ Extract content references from a stored value. - Args: - value: The stored value (could be JSON string or dict) + Parameters + ---------- + value : Any + The stored value (could be JSON string or dict). - Returns: - List of (content_hash, store_name) tuples + Returns + ------- + list[tuple[str, str | None]] + List of (content_hash, store_name) tuples. """ refs = [] @@ -120,11 +132,15 @@ def _extract_object_refs(value: Any) -> list[tuple[str, str | None]]: """ Extract object path references from a stored value. - Args: - value: The stored value (could be JSON string or dict) + Parameters + ---------- + value : Any + The stored value (could be JSON string or dict). - Returns: - List of (path, store_name) tuples + Returns + ------- + list[tuple[str, str | None]] + List of (path, store_name) tuples. """ refs = [] @@ -156,13 +172,19 @@ def scan_references( Examines all tables in the given schemas and extracts content hashes from columns that use content-addressed storage (, , ). - Args: - *schemas: Schema instances to scan - store_name: Only include references to this store (None = all stores) - verbose: Print progress information - - Returns: - Set of content hashes that are referenced + Parameters + ---------- + *schemas : Schema + Schema instances to scan. + store_name : str, optional + Only include references to this store (None = all stores). + verbose : bool, optional + Print progress information. + + Returns + ------- + set[str] + Set of content hashes that are referenced. """ referenced: set[str] = set() @@ -213,13 +235,19 @@ def scan_object_references( Examines all tables in the given schemas and extracts object paths from columns that use path-addressed storage (). - Args: - *schemas: Schema instances to scan - store_name: Only include references to this store (None = all stores) - verbose: Print progress information - - Returns: - Set of object paths that are referenced + Parameters + ---------- + *schemas : Schema + Schema instances to scan. + store_name : str, optional + Only include references to this store (None = all stores). + verbose : bool, optional + Print progress information. + + Returns + ------- + set[str] + Set of object paths that are referenced. """ referenced: set[str] = set() @@ -265,11 +293,15 @@ def list_stored_content(store_name: str | None = None) -> dict[str, int]: Scans the _content/ directory in the specified store and returns all content hashes found. - Args: - store_name: Store to scan (None = default store) + Parameters + ---------- + store_name : str, optional + Store to scan (None = default store). - Returns: - Dict mapping content_hash to size in bytes + Returns + ------- + dict[str, int] + Dict mapping content_hash to size in bytes. """ backend = get_store_backend(store_name) stored: dict[str, int] = {} @@ -315,11 +347,15 @@ def list_stored_objects(store_name: str | None = None) -> dict[str, int]: Scans for directories matching the object storage pattern: {schema}/{table}/objects/{pk}/{field}_{token}/ - Args: - store_name: Store to scan (None = default store) + Parameters + ---------- + store_name : str, optional + Store to scan (None = default store). - Returns: - Dict mapping object_path to size in bytes + Returns + ------- + dict[str, int] + Dict mapping object_path to size in bytes. 
""" backend = get_store_backend(store_name) stored: dict[str, int] = {} @@ -364,12 +400,17 @@ def delete_object(path: str, store_name: str | None = None) -> bool: """ Delete an object directory from storage. - Args: - path: Object path (relative to store root) - store_name: Store name (None = default store) - - Returns: - True if deleted, False if not found + Parameters + ---------- + path : str + Object path (relative to store root). + store_name : str, optional + Store name (None = default store). + + Returns + ------- + bool + True if deleted, False if not found. """ backend = get_store_backend(store_name) @@ -397,13 +438,20 @@ def scan( Scans both content-addressed storage (for , , ) and path-addressed storage (for ). - Args: - *schemas: Schema instances to scan - store_name: Store to check (None = default store) - verbose: Print progress information - - Returns: + Parameters + ---------- + *schemas : Schema + Schema instances to scan. + store_name : str, optional + Store to check (None = default store). + verbose : bool, optional + Print progress information. + + Returns + ------- + dict[str, Any] Dict with scan statistics: + - content_referenced: Number of content items referenced in database - content_stored: Number of content items in storage - content_orphaned: Number of unreferenced content items @@ -463,14 +511,22 @@ def collect( Scans the given schemas for content and object references, then removes any storage items that are not referenced. - Args: - *schemas: Schema instances to scan - store_name: Store to clean (None = default store) - dry_run: If True, report what would be deleted without deleting - verbose: Print progress information - - Returns: + Parameters + ---------- + *schemas : Schema + Schema instances to scan. + store_name : str, optional + Store to clean (None = default store). + dry_run : bool, optional + If True, report what would be deleted without deleting. Default True. + verbose : bool, optional + Print progress information. + + Returns + ------- + dict[str, Any] Dict with collection statistics: + - referenced: Total items referenced in database - stored: Total items in storage - orphaned: Total unreferenced items @@ -541,11 +597,15 @@ def format_stats(stats: dict[str, Any]) -> str: """ Format GC statistics as a human-readable string. - Args: - stats: Statistics dict from scan() or collect() + Parameters + ---------- + stats : dict[str, Any] + Statistics dict from scan() or collect(). - Returns: - Formatted string + Returns + ------- + str + Formatted string. """ lines = ["External Storage Statistics:"] diff --git a/src/datajoint/lineage.py b/src/datajoint/lineage.py index 63a2d675b..d40ed8dd8 100644 --- a/src/datajoint/lineage.py +++ b/src/datajoint/lineage.py @@ -31,8 +31,12 @@ def ensure_lineage_table(connection, database): """ Create the ~lineage table in the schema if it doesn't exist. - :param connection: A DataJoint connection object - :param database: The schema/database name + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. """ connection.query( """ @@ -50,9 +54,17 @@ def lineage_table_exists(connection, database): """ Check if the ~lineage table exists in the schema. - :param connection: A DataJoint connection object - :param database: The schema/database name - :return: True if the table exists, False otherwise + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. 
+ + Returns + ------- + bool + True if the table exists, False otherwise. """ result = connection.query( """ @@ -68,11 +80,21 @@ def get_lineage(connection, database, table_name, attribute_name): """ Get the lineage for an attribute from the ~lineage table. - :param connection: A DataJoint connection object - :param database: The schema/database name - :param table_name: The table name - :param attribute_name: The attribute name - :return: The lineage string, or None if not found + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. + table_name : str + The table name. + attribute_name : str + The attribute name. + + Returns + ------- + str or None + The lineage string, or None if not found. """ if not lineage_table_exists(connection, database): return None @@ -91,10 +113,19 @@ def get_table_lineages(connection, database, table_name): """ Get all lineages for a table from the ~lineage table. - :param connection: A DataJoint connection object - :param database: The schema/database name - :param table_name: The table name - :return: A dict mapping attribute names to lineage strings + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. + table_name : str + The table name. + + Returns + ------- + dict[str, str] + Dict mapping attribute names to lineage strings. """ if not lineage_table_exists(connection, database): return {} @@ -113,9 +144,17 @@ def get_schema_lineages(connection, database): """ Get all lineages for a schema from the ~lineage table. - :param connection: A DataJoint connection object - :param database: The schema/database name - :return: A dict mapping 'schema.table.attribute' to its lineage + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. + + Returns + ------- + dict[str, str] + Dict mapping 'schema.table.attribute' to its lineage. """ if not lineage_table_exists(connection, database): return {} @@ -133,9 +172,14 @@ def insert_lineages(connection, database, entries): """ Insert multiple lineage entries in the ~lineage table as a single transaction. - :param connection: A DataJoint connection object - :param database: The schema/database name - :param entries: A list of (table_name, attribute_name, lineage) tuples + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. + entries : list[tuple[str, str, str]] + List of (table_name, attribute_name, lineage) tuples. """ if not entries: return @@ -158,9 +202,14 @@ def delete_table_lineages(connection, database, table_name): """ Delete all lineage entries for a table. - :param connection: A DataJoint connection object - :param database: The schema/database name - :param table_name: The table name + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. + table_name : str + The table name. """ if not lineage_table_exists(connection, database): return @@ -187,8 +236,17 @@ def rebuild_schema_lineage(connection, database): If a referenced attribute in another schema has no lineage entry, a DataJointError is raised. - :param connection: A DataJoint connection object - :param database: The schema/database name + Parameters + ---------- + connection : Connection + A DataJoint connection object. + database : str + The schema/database name. 
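As a concrete reading of the helper signatures above, a hedged sketch of recording and retrieving lineage entries. The connection is assumed to exist already, and the schema/table/attribute names and the ``schema.table.attribute`` lineage string are placeholders for illustration.

```python
# Illustrative use of the lineage helpers; names and lineage strings are placeholders.
from datajoint.lineage import ensure_lineage_table, get_table_lineages, insert_lineages


def record_example_lineage(connection, database: str = "my_db") -> dict[str, str]:
    """Insert one lineage entry and return all lineages recorded for the table."""
    ensure_lineage_table(connection, database)
    insert_lineages(
        connection,
        database,
        [("experiment", "session_id", f"{database}.experiment.session_id")],  # (table, attribute, lineage)
    )
    return get_table_lineages(connection, database, "experiment")  # attribute -> lineage
```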
+ + Raises + ------ + DataJointError + If a referenced attribute in another schema has no lineage entry. """ # Ensure the lineage table exists ensure_lineage_table(connection, database) diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index 7429cc938..3a2bf2ce6 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -28,29 +28,36 @@ def analyze_blob_columns(schema: Schema) -> list[dict]: Analyze a schema to find blob columns that could be migrated to . This function identifies blob columns that: + 1. Have a MySQL blob type (tinyblob, blob, mediumblob, longblob) 2. Do NOT already have a codec/type specified in their comment All blob size variants are included in the analysis. - Args: - schema: The DataJoint schema to analyze. + Parameters + ---------- + schema : Schema + The DataJoint schema to analyze. - Returns: + Returns + ------- + list[dict] List of dicts with keys: - - table_name: Full table name (database.table) - - column_name: Name of the blob column - - column_type: MySQL column type (tinyblob, blob, mediumblob, longblob) - - current_comment: Current column comment - - needs_migration: True if column should be migrated - - Example: - >>> import datajoint as dj - >>> schema = dj.schema('my_database') - >>> columns = dj.migrate.analyze_blob_columns(schema) - >>> for col in columns: - ... if col['needs_migration']: - ... print(f"{col['table_name']}.{col['column_name']} ({col['column_type']})") + + - table_name: Full table name (database.table) + - column_name: Name of the blob column + - column_type: MySQL column type (tinyblob, blob, mediumblob, longblob) + - current_comment: Current column comment + - needs_migration: True if column should be migrated + + Examples + -------- + >>> import datajoint as dj + >>> schema = dj.schema('my_database') + >>> columns = dj.migrate.analyze_blob_columns(schema) + >>> for col in columns: + ... if col['needs_migration']: + ... print(f"{col['table_name']}.{col['column_name']} ({col['column_type']})") """ results = [] @@ -108,23 +115,31 @@ def generate_migration_sql( include the `::` prefix, marking them as using explicit DataJoint blob serialization. - Args: - schema: The DataJoint schema to migrate. - target_type: The type name to migrate to (default: "blob"). - dry_run: If True, only return SQL without executing. - - Returns: + Parameters + ---------- + schema : Schema + The DataJoint schema to migrate. + target_type : str, optional + The type name to migrate to. Default "blob". + dry_run : bool, optional + If True, only return SQL without executing. + + Returns + ------- + list[str] List of SQL ALTER TABLE statements. - Example: - >>> sql_statements = dj.migrate.generate_migration_sql(schema) - >>> for sql in sql_statements: - ... print(sql) - - Note: - This is a metadata-only migration. The actual blob data format - remains unchanged - only the column comments are updated to - indicate explicit type handling. + Examples + -------- + >>> sql_statements = dj.migrate.generate_migration_sql(schema) + >>> for sql in sql_statements: + ... print(sql) + + Notes + ----- + This is a metadata-only migration. The actual blob data format + remains unchanged - only the column comments are updated to + indicate explicit type handling. """ columns = analyze_blob_columns(schema) sql_statements = [] @@ -165,31 +180,40 @@ def migrate_blob_columns( This updates column comments in the database to include the type declaration. The data format remains unchanged. - Args: - schema: The DataJoint schema to migrate. 
- target_type: The type name to migrate to (default: "blob"). - dry_run: If True, only preview changes without applying. - - Returns: + Parameters + ---------- + schema : Schema + The DataJoint schema to migrate. + target_type : str, optional + The type name to migrate to. Default "blob". + dry_run : bool, optional + If True, only preview changes without applying. Default True. + + Returns + ------- + dict Dict with keys: - - analyzed: Number of blob columns analyzed - - needs_migration: Number of columns that need migration - - migrated: Number of columns migrated (0 if dry_run) - - sql_statements: List of SQL statements (executed or to be executed) - - Example: - >>> # Preview migration - >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=True) - >>> print(f"Would migrate {result['needs_migration']} columns") - - >>> # Apply migration - >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=False) - >>> print(f"Migrated {result['migrated']} columns") - - Warning: - After migration, table definitions should be updated to use - `` instead of `longblob` for consistency. The migration - only updates database metadata; source code changes are manual. + + - analyzed: Number of blob columns analyzed + - needs_migration: Number of columns that need migration + - migrated: Number of columns migrated (0 if dry_run) + - sql_statements: List of SQL statements (executed or to be executed) + + Examples + -------- + >>> # Preview migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=True) + >>> print(f"Would migrate {result['needs_migration']} columns") + + >>> # Apply migration + >>> result = dj.migrate.migrate_blob_columns(schema, dry_run=False) + >>> print(f"Migrated {result['migrated']} columns") + + Warnings + -------- + After migration, table definitions should be updated to use + ```` instead of ``longblob`` for consistency. The migration + only updates database metadata; source code changes are manual. """ columns = analyze_blob_columns(schema) sql_statements = generate_migration_sql(schema, target_type=target_type) @@ -226,19 +250,25 @@ def check_migration_status(schema: Schema) -> dict: """ Check the migration status of blob columns in a schema. - Args: - schema: The DataJoint schema to check. + Parameters + ---------- + schema : Schema + The DataJoint schema to check. - Returns: + Returns + ------- + dict Dict with keys: - - total_blob_columns: Total number of blob columns - - migrated: Number of columns with explicit type - - pending: Number of columns using implicit serialization - - columns: List of column details - - Example: - >>> status = dj.migrate.check_migration_status(schema) - >>> print(f"Migration progress: {status['migrated']}/{status['total_blob_columns']}") + + - total_blob_columns: Total number of blob columns + - migrated: Number of columns with explicit type + - pending: Number of columns using implicit serialization + - columns: List of column details + + Examples + -------- + >>> status = dj.migrate.check_migration_status(schema) + >>> print(f"Migration progress: {status['migrated']}/{status['total_blob_columns']}") """ columns = analyze_blob_columns(schema) @@ -296,36 +326,44 @@ def add_job_metadata_columns(target, dry_run: bool = True) -> dict: _job_version) to tables that were created before config.jobs.add_job_metadata was enabled. - Args: - target: Either a table class/instance (dj.Computed or dj.Imported) or - a Schema object. If a Schema, all Computed/Imported tables in - the schema will be processed. 
- dry_run: If True (default), only preview changes without applying. - - Returns: + Parameters + ---------- + target : Table or Schema + Either a table class/instance (dj.Computed or dj.Imported) or + a Schema object. If a Schema, all Computed/Imported tables in + the schema will be processed. + dry_run : bool, optional + If True, only preview changes without applying. Default True. + + Returns + ------- + dict Dict with keys: - - tables_analyzed: Number of tables checked - - tables_modified: Number of tables that were/would be modified - - columns_added: Total columns added across all tables - - details: List of dicts with per-table information - - Example: - >>> import datajoint as dj - >>> from datajoint.migrate import add_job_metadata_columns - >>> - >>> # Preview migration for a single table - >>> result = add_job_metadata_columns(MyComputedTable, dry_run=True) - >>> print(f"Would add {result['columns_added']} columns") - >>> - >>> # Apply migration to all tables in a schema - >>> result = add_job_metadata_columns(schema, dry_run=False) - >>> print(f"Modified {result['tables_modified']} tables") - - Note: - - Only Computed and Imported tables are modified (not Manual, Lookup, or Part tables) - - Existing rows will have NULL values for _job_start_time and _job_duration - - Future populate() calls will fill in metadata for new rows - - This does NOT retroactively populate metadata for existing rows + + - tables_analyzed: Number of tables checked + - tables_modified: Number of tables that were/would be modified + - columns_added: Total columns added across all tables + - details: List of dicts with per-table information + + Examples + -------- + >>> import datajoint as dj + >>> from datajoint.migrate import add_job_metadata_columns + >>> + >>> # Preview migration for a single table + >>> result = add_job_metadata_columns(MyComputedTable, dry_run=True) + >>> print(f"Would add {result['columns_added']} columns") + >>> + >>> # Apply migration to all tables in a schema + >>> result = add_job_metadata_columns(schema, dry_run=False) + >>> print(f"Modified {result['tables_modified']} tables") + + Notes + ----- + - Only Computed and Imported tables are modified (not Manual, Lookup, or Part) + - Existing rows will have NULL values for _job_start_time and _job_duration + - Future populate() calls will fill in metadata for new rows + - This does NOT retroactively populate metadata for existing rows """ from .schemas import Schema from .table import Table diff --git a/src/datajoint/objectref.py b/src/datajoint/objectref.py index 60cb01af6..9a049b2cf 100644 --- a/src/datajoint/objectref.py +++ b/src/datajoint/objectref.py @@ -66,12 +66,17 @@ def from_json(cls, json_data: dict | str, backend: StorageBackend | None = None) """ Create an ObjectRef from JSON metadata stored in the database. - Args: - json_data: JSON string or dict containing object metadata - backend: StorageBackend instance for file operations - - Returns: - ObjectRef instance + Parameters + ---------- + json_data : dict or str + JSON string or dict containing object metadata. + backend : StorageBackend, optional + StorageBackend instance for file operations. + + Returns + ------- + ObjectRef + ObjectRef instance. """ if isinstance(json_data, str): data = json.loads(json_data) @@ -100,8 +105,10 @@ def to_json(self) -> dict: """ Convert ObjectRef to JSON-serializable dict for database storage. - Returns: - Dict suitable for JSON serialization + Returns + ------- + dict + Dict suitable for JSON serialization. 
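To make the serialization round trip concrete, a sketch of rebuilding an ``ObjectRef`` from stored metadata. The key set follows the fields listed for this class, but the values are invented placeholders and the exact set of required keys is not verified here.

```python
# Illustrative: reconstruct an ObjectRef from database metadata (placeholder values).
from datajoint.objectref import ObjectRef

metadata = {
    "path": "my_schema/my_table/objects/1/image_abc123",  # relative path within the store
    "store": None,          # None selects the default store
    "size": 2048,
    "hash": None,
    "ext": ".png",
    "is_dir": False,
    "timestamp": "2026-01-04T12:00:00",
}
ref = ObjectRef.from_json(metadata)            # also accepts the same structure as a JSON string
assert ref.to_json()["path"] == metadata["path"]
```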
""" data = { "path": self.path, @@ -129,18 +136,21 @@ def to_dict(self) -> dict: any storage backend operations. The returned dict matches the JSON structure stored in the database. - Returns: + Returns + ------- + dict Dict containing the object metadata: - - path: Relative storage path within the store - - url: Full URI (e.g., 's3://bucket/path') (optional) - - store: Store name (optional, None for default store) - - size: File/folder size in bytes (or None) - - hash: Content hash (or None) - - ext: File extension (or None) - - is_dir: True if folder - - timestamp: Upload timestamp - - mime_type: MIME type (files only, optional) - - item_count: Number of files (folders only, optional) + + - path: Relative storage path within the store + - url: Full URI (e.g., 's3://bucket/path') (optional) + - store: Store name (optional, None for default store) + - size: File/folder size in bytes (or None) + - hash: Content hash (or None) + - ext: File extension (or None) + - is_dir: True if folder + - timestamp: Upload timestamp + - mime_type: MIME type (files only, optional) + - item_count: Number of files (folders only, optional) """ return self.to_json() @@ -205,11 +215,15 @@ def read(self) -> bytes: """ Read entire file content as bytes. - Returns: - File contents as bytes + Returns + ------- + bytes + File contents as bytes. - Raises: - DataJointError: If object is a directory + Raises + ------ + DataJointError + If object is a directory. """ if self.is_dir: raise DataJointError("Cannot read() a directory. Use listdir() or walk() instead.") @@ -220,12 +234,17 @@ def open(self, subpath: str | None = None, mode: str = "rb") -> IO: """ Open file for reading. - Args: - subpath: Optional path within directory (for folder objects) - mode: File mode ('rb' for binary read, 'r' for text) - - Returns: - File-like object + Parameters + ---------- + subpath : str, optional + Path within directory (for folder objects). + mode : str, optional + File mode ('rb' for binary read, 'r' for text). Default 'rb'. + + Returns + ------- + IO + File-like object. """ self._ensure_backend() path = self.path @@ -239,11 +258,15 @@ def listdir(self, subpath: str = "") -> list[str]: """ List contents of directory. - Args: - subpath: Optional subdirectory path + Parameters + ---------- + subpath : str, optional + Subdirectory path. Default empty string (root). - Returns: - List of filenames/directory names + Returns + ------- + list[str] + List of filenames/directory names. """ if not self.is_dir: raise DataJointError("Cannot listdir() on a file. Use read() or open() instead.") @@ -258,8 +281,10 @@ def walk(self) -> Iterator[tuple[str, list[str], list[str]]]: """ Walk directory tree, similar to os.walk(). - Yields: - Tuples of (dirpath, dirnames, filenames) + Yields + ------ + tuple[str, list[str], list[str]] + Tuples of (dirpath, dirnames, filenames). """ if not self.is_dir: raise DataJointError("Cannot walk() on a file.") @@ -274,12 +299,17 @@ def download(self, destination: Path | str, subpath: str | None = None) -> Path: """ Download object to local filesystem. - Args: - destination: Local directory or file path - subpath: Optional path within directory (for folder objects) - - Returns: - Path to downloaded file/directory + Parameters + ---------- + destination : Path or str + Local directory or file path. + subpath : str, optional + Path within directory (for folder objects). + + Returns + ------- + Path + Path to downloaded file/directory. 
""" self._ensure_backend() destination = Path(destination) @@ -310,11 +340,15 @@ def exists(self, subpath: str | None = None) -> bool: """ Check if object (or subpath within it) exists. - Args: - subpath: Optional path within directory + Parameters + ---------- + subpath : str, optional + Path within directory. - Returns: - True if exists + Returns + ------- + bool + True if exists. """ self._ensure_backend() path = f"{self.path}/{subpath}" if subpath else self.path @@ -327,11 +361,15 @@ def verify(self) -> bool: For files: checks size matches, and hash if available. For folders: validates manifest (all files exist with correct sizes). - Returns: - True if valid + Returns + ------- + bool + True if valid. - Raises: - IntegrityError: If verification fails with details + Raises + ------ + IntegrityError + If verification fails with details. """ self._ensure_backend() From ff04914c0623d9ff27939e7f399a783949c092b4 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 13:59:51 -0600 Subject: [PATCH 05/15] docs: Update README and change license to Apache 2.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Improve README opening with clear DataJoint definition - Add key features list - Simplify badge table layout - Change license from LGPL-2.1 to Apache 2.0 - Add new LICENSE file with Apache 2.0 text - Copyright 2024 DataJoint Inc. and contributors πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- LICENSE | 190 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 104 ++++++++---------------------- 2 files changed, 216 insertions(+), 78 deletions(-) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..4cdf770f0 --- /dev/null +++ b/LICENSE @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2024 DataJoint Inc. and contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index f4a6f8352..c6d60b75b 100644 --- a/README.md +++ b/README.md @@ -1,116 +1,64 @@ -# Welcome to DataJoint for Python! +# DataJoint for Python + +DataJoint is an open-source Python framework for building scientific data pipelines. +It implements the **Relational Workflow Model**β€”a paradigm that extends relational +databases with native support for computational workflows. + +**Key Features:** + +- **Declarative schema design** β€” Define tables and relationships in Python +- **Automatic dependency tracking** β€” Foreign keys encode workflow dependencies +- **Built-in computation** β€” Imported and Computed tables run automatically +- **Data integrity** β€” Referential integrity and transaction support +- **Reproducibility** β€” Immutable data with full provenance + +**Documentation:** https://docs.datajoint.com - - - - - + - - - - - + - - - - - - - - - - - - - - + + - - - - -
[README badge table (HTML) simplified: the PyPI release, conda-forge release, test status, and coverage badges are kept; rows for download counts, "Since Release", release status, doc status, and the developer Slack chat are removed; the License badge changes from LGPL-2.1 to Apache-2.0; and the Citation row now links a DOI in place of the bioRxiv and Zenodo badges.]
-DataJoint for Python is a framework for scientific workflow management based on -relational principles. DataJoint is built on the foundation of the relational data -model and prescribes a consistent method for organizing, populating, computing, and -querying data. - -DataJoint was initially developed in 2009 by Dimitri Yatsenko in Andreas Tolias' Lab at -Baylor College of Medicine for the distributed processing and management of large -volumes of data streaming from regular experiments. Starting in 2011, DataJoint has -been available as an open-source project adopted by other labs and improved through -contributions from several developers. -Presently, the primary developer of DataJoint open-source software is the company -DataJoint (). - ## Data Pipeline Example ![pipeline](https://raw.githubusercontent.com/datajoint/datajoint-python/master/images/pipeline.png) From 7c7e404133abd850b48d4d6b1f85470f23914733 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 14:27:29 -0600 Subject: [PATCH 06/15] docs: Restructure to developer-focused documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update index.md to point users to docs.datajoint.com - Create architecture/ section with transpilation docs - Archive user-facing content (now in datajoint-docs) - Simplify mkdocs.yaml for developer documentation - Update docstring style to numpy in mkdocs config User documentation is now at docs.datajoint.com. This site focuses on contributing, architecture, and API reference. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- DOCSTRING_STYLE.md | 499 ++++++++++++++++++ docs/mkdocs.yaml | 127 +---- docs/src/architecture/index.md | 34 ++ .../transpilation.md | 0 docs/src/{ => archive}/citation.md | 0 docs/src/{ => archive}/client/credentials.md | 0 docs/src/{ => archive}/client/install.md | 0 docs/src/{ => archive}/client/settings.md | 0 .../compute/autopopulate2.0-spec.md | 0 docs/src/{ => archive}/compute/distributed.md | 0 docs/src/{ => archive}/compute/key-source.md | 0 docs/src/{ => archive}/compute/make.md | 0 docs/src/{ => archive}/compute/populate.md | 0 docs/src/{ => archive}/concepts/data-model.md | 0 .../{ => archive}/concepts/data-pipelines.md | 0 docs/src/{ => archive}/concepts/principles.md | 0 docs/src/{ => archive}/concepts/teamwork.md | 0 .../src/{ => archive}/concepts/terminology.md | 0 docs/src/{ => archive}/design/alter.md | 0 docs/src/{ => archive}/design/diagrams.md | 0 docs/src/{ => archive}/design/drop.md | 0 .../design/fetch-api-2.0-spec.md | 0 .../design/hidden-job-metadata-spec.md | 0 docs/src/{ => archive}/design/integrity.md | 0 .../src/{ => archive}/design/normalization.md | 0 .../src/{ => archive}/design/pk-rules-spec.md | 0 docs/src/{ => archive}/design/recall.md | 0 docs/src/{ => archive}/design/schema.md | 0 .../design/semantic-matching-spec.md | 0 .../src/{ => archive}/design/tables/attach.md | 0 .../{ => archive}/design/tables/attributes.md | 0 docs/src/{ => archive}/design/tables/blobs.md | 0 .../{ => archive}/design/tables/codec-spec.md | 0 .../src/{ => archive}/design/tables/codecs.md | 0 .../{ => archive}/design/tables/declare.md | 0 .../design/tables/dependencies.md | 0 .../{ => archive}/design/tables/filepath.md | 0 .../{ => archive}/design/tables/indexes.md | 0 .../src/{ => archive}/design/tables/lookup.md | 0 .../src/{ => archive}/design/tables/manual.md | 0 .../design/tables/master-part.md | 0 .../src/{ => archive}/design/tables/object.md | 0 
.../{ => archive}/design/tables/primary.md | 0 .../design/tables/storage-types-spec.md | 0 docs/src/{ => archive}/design/tables/tiers.md | 0 docs/src/{ => archive}/faq.md | 0 .../src/{ => archive}/images/StudentTable.png | Bin .../images/added-example-ERD.svg | 0 .../{ => archive}/images/data-engineering.png | Bin .../images/data-science-after.png | Bin .../images/data-science-before.png | Bin .../{ => archive}/images/diff-example1.png | Bin .../{ => archive}/images/diff-example2.png | Bin .../{ => archive}/images/diff-example3.png | Bin docs/src/{ => archive}/images/dimitri-ERD.svg | 0 docs/src/{ => archive}/images/doc_1-1.png | Bin docs/src/{ => archive}/images/doc_1-many.png | Bin docs/src/{ => archive}/images/doc_many-1.png | Bin .../{ => archive}/images/doc_many-many.png | Bin .../src/{ => archive}/images/how-it-works.png | Bin .../images/install-cmd-prompt.png | Bin .../images/install-datajoint-1.png | Bin .../images/install-datajoint-2.png | Bin .../{ => archive}/images/install-git-1.png | Bin .../images/install-graphviz-1.png | Bin .../images/install-graphviz-2a.png | Bin .../images/install-graphviz-2b.png | Bin .../images/install-jupyter-1.png | Bin .../images/install-jupyter-2.png | Bin .../images/install-matplotlib.png | Bin .../images/install-pydotplus.png | Bin .../images/install-python-advanced-1.png | Bin .../images/install-python-advanced-2.png | Bin .../images/install-python-simple.png | Bin .../images/install-run-jupyter-1.png | Bin .../images/install-run-jupyter-2.png | Bin .../images/install-verify-graphviz.png | Bin .../images/install-verify-jupyter.png | Bin .../images/install-verify-python.png | Bin .../{ => archive}/images/join-example1.png | Bin .../{ => archive}/images/join-example2.png | Bin .../{ => archive}/images/join-example3.png | Bin .../images/key_source_combination.png | Bin .../src/{ => archive}/images/map-dataflow.png | Bin .../{ => archive}/images/matched_tuples1.png | Bin .../{ => archive}/images/matched_tuples2.png | Bin .../{ => archive}/images/matched_tuples3.png | Bin docs/src/{ => archive}/images/mp-diagram.png | Bin docs/src/{ => archive}/images/op-restrict.png | Bin .../{ => archive}/images/outer-example1.png | Bin .../images/pipeline-database.png | Bin docs/src/{ => archive}/images/pipeline.png | Bin .../images/python_collection.png | Bin .../images/queries_example_diagram.png | Bin .../images/query_object_preview.png | Bin .../images/restrict-example1.png | Bin .../images/restrict-example2.png | Bin .../images/restrict-example3.png | Bin .../{ => archive}/images/shapes_pipeline.svg | 0 .../images/spawned-classes-ERD.svg | 0 .../{ => archive}/images/union-example1.png | Bin .../{ => archive}/images/union-example2.png | Bin .../images/virtual-module-ERD.svg | 0 docs/src/{ => archive}/manipulation/delete.md | 0 docs/src/{ => archive}/manipulation/index.md | 0 docs/src/{ => archive}/manipulation/insert.md | 0 .../manipulation/transactions.md | 0 docs/src/{ => archive}/manipulation/update.md | 0 docs/src/{ => archive}/publish-data.md | 0 docs/src/{ => archive}/query/aggregation.md | 0 .../src/{ => archive}/query/example-schema.md | 0 docs/src/{ => archive}/query/fetch.md | 0 docs/src/{ => archive}/query/iteration.md | 0 docs/src/{ => archive}/query/join.md | 0 docs/src/{ => archive}/query/operators.md | 0 docs/src/{ => archive}/query/principles.md | 0 docs/src/{ => archive}/query/project.md | 0 docs/src/{ => archive}/query/query-caching.md | 0 docs/src/{ => archive}/query/restrict.md | 0 docs/src/{ => archive}/query/union.md | 0 docs/src/{ => 
archive}/query/universals.md | 0 docs/src/{ => archive}/quick-start.md | 0 .../{ => archive}/sysadmin/bulk-storage.md | 0 .../{ => archive}/sysadmin/database-admin.md | 0 .../{ => archive}/sysadmin/external-store.md | 0 docs/src/{ => archive}/tutorials/dj-top.ipynb | 0 docs/src/{ => archive}/tutorials/json.ipynb | 0 docs/src/index.md | 61 +-- 128 files changed, 582 insertions(+), 139 deletions(-) create mode 100644 DOCSTRING_STYLE.md create mode 100644 docs/src/architecture/index.md rename docs/src/{internal => architecture}/transpilation.md (100%) rename docs/src/{ => archive}/citation.md (100%) rename docs/src/{ => archive}/client/credentials.md (100%) rename docs/src/{ => archive}/client/install.md (100%) rename docs/src/{ => archive}/client/settings.md (100%) rename docs/src/{ => archive}/compute/autopopulate2.0-spec.md (100%) rename docs/src/{ => archive}/compute/distributed.md (100%) rename docs/src/{ => archive}/compute/key-source.md (100%) rename docs/src/{ => archive}/compute/make.md (100%) rename docs/src/{ => archive}/compute/populate.md (100%) rename docs/src/{ => archive}/concepts/data-model.md (100%) rename docs/src/{ => archive}/concepts/data-pipelines.md (100%) rename docs/src/{ => archive}/concepts/principles.md (100%) rename docs/src/{ => archive}/concepts/teamwork.md (100%) rename docs/src/{ => archive}/concepts/terminology.md (100%) rename docs/src/{ => archive}/design/alter.md (100%) rename docs/src/{ => archive}/design/diagrams.md (100%) rename docs/src/{ => archive}/design/drop.md (100%) rename docs/src/{ => archive}/design/fetch-api-2.0-spec.md (100%) rename docs/src/{ => archive}/design/hidden-job-metadata-spec.md (100%) rename docs/src/{ => archive}/design/integrity.md (100%) rename docs/src/{ => archive}/design/normalization.md (100%) rename docs/src/{ => archive}/design/pk-rules-spec.md (100%) rename docs/src/{ => archive}/design/recall.md (100%) rename docs/src/{ => archive}/design/schema.md (100%) rename docs/src/{ => archive}/design/semantic-matching-spec.md (100%) rename docs/src/{ => archive}/design/tables/attach.md (100%) rename docs/src/{ => archive}/design/tables/attributes.md (100%) rename docs/src/{ => archive}/design/tables/blobs.md (100%) rename docs/src/{ => archive}/design/tables/codec-spec.md (100%) rename docs/src/{ => archive}/design/tables/codecs.md (100%) rename docs/src/{ => archive}/design/tables/declare.md (100%) rename docs/src/{ => archive}/design/tables/dependencies.md (100%) rename docs/src/{ => archive}/design/tables/filepath.md (100%) rename docs/src/{ => archive}/design/tables/indexes.md (100%) rename docs/src/{ => archive}/design/tables/lookup.md (100%) rename docs/src/{ => archive}/design/tables/manual.md (100%) rename docs/src/{ => archive}/design/tables/master-part.md (100%) rename docs/src/{ => archive}/design/tables/object.md (100%) rename docs/src/{ => archive}/design/tables/primary.md (100%) rename docs/src/{ => archive}/design/tables/storage-types-spec.md (100%) rename docs/src/{ => archive}/design/tables/tiers.md (100%) rename docs/src/{ => archive}/faq.md (100%) rename docs/src/{ => archive}/images/StudentTable.png (100%) rename docs/src/{ => archive}/images/added-example-ERD.svg (100%) rename docs/src/{ => archive}/images/data-engineering.png (100%) rename docs/src/{ => archive}/images/data-science-after.png (100%) rename docs/src/{ => archive}/images/data-science-before.png (100%) rename docs/src/{ => archive}/images/diff-example1.png (100%) rename docs/src/{ => archive}/images/diff-example2.png (100%) rename docs/src/{ 
=> archive}/images/diff-example3.png (100%) rename docs/src/{ => archive}/images/dimitri-ERD.svg (100%) rename docs/src/{ => archive}/images/doc_1-1.png (100%) rename docs/src/{ => archive}/images/doc_1-many.png (100%) rename docs/src/{ => archive}/images/doc_many-1.png (100%) rename docs/src/{ => archive}/images/doc_many-many.png (100%) rename docs/src/{ => archive}/images/how-it-works.png (100%) rename docs/src/{ => archive}/images/install-cmd-prompt.png (100%) rename docs/src/{ => archive}/images/install-datajoint-1.png (100%) rename docs/src/{ => archive}/images/install-datajoint-2.png (100%) rename docs/src/{ => archive}/images/install-git-1.png (100%) rename docs/src/{ => archive}/images/install-graphviz-1.png (100%) rename docs/src/{ => archive}/images/install-graphviz-2a.png (100%) rename docs/src/{ => archive}/images/install-graphviz-2b.png (100%) rename docs/src/{ => archive}/images/install-jupyter-1.png (100%) rename docs/src/{ => archive}/images/install-jupyter-2.png (100%) rename docs/src/{ => archive}/images/install-matplotlib.png (100%) rename docs/src/{ => archive}/images/install-pydotplus.png (100%) rename docs/src/{ => archive}/images/install-python-advanced-1.png (100%) rename docs/src/{ => archive}/images/install-python-advanced-2.png (100%) rename docs/src/{ => archive}/images/install-python-simple.png (100%) rename docs/src/{ => archive}/images/install-run-jupyter-1.png (100%) rename docs/src/{ => archive}/images/install-run-jupyter-2.png (100%) rename docs/src/{ => archive}/images/install-verify-graphviz.png (100%) rename docs/src/{ => archive}/images/install-verify-jupyter.png (100%) rename docs/src/{ => archive}/images/install-verify-python.png (100%) rename docs/src/{ => archive}/images/join-example1.png (100%) rename docs/src/{ => archive}/images/join-example2.png (100%) rename docs/src/{ => archive}/images/join-example3.png (100%) rename docs/src/{ => archive}/images/key_source_combination.png (100%) rename docs/src/{ => archive}/images/map-dataflow.png (100%) rename docs/src/{ => archive}/images/matched_tuples1.png (100%) rename docs/src/{ => archive}/images/matched_tuples2.png (100%) rename docs/src/{ => archive}/images/matched_tuples3.png (100%) rename docs/src/{ => archive}/images/mp-diagram.png (100%) rename docs/src/{ => archive}/images/op-restrict.png (100%) rename docs/src/{ => archive}/images/outer-example1.png (100%) rename docs/src/{ => archive}/images/pipeline-database.png (100%) rename docs/src/{ => archive}/images/pipeline.png (100%) rename docs/src/{ => archive}/images/python_collection.png (100%) rename docs/src/{ => archive}/images/queries_example_diagram.png (100%) rename docs/src/{ => archive}/images/query_object_preview.png (100%) rename docs/src/{ => archive}/images/restrict-example1.png (100%) rename docs/src/{ => archive}/images/restrict-example2.png (100%) rename docs/src/{ => archive}/images/restrict-example3.png (100%) rename docs/src/{ => archive}/images/shapes_pipeline.svg (100%) rename docs/src/{ => archive}/images/spawned-classes-ERD.svg (100%) rename docs/src/{ => archive}/images/union-example1.png (100%) rename docs/src/{ => archive}/images/union-example2.png (100%) rename docs/src/{ => archive}/images/virtual-module-ERD.svg (100%) rename docs/src/{ => archive}/manipulation/delete.md (100%) rename docs/src/{ => archive}/manipulation/index.md (100%) rename docs/src/{ => archive}/manipulation/insert.md (100%) rename docs/src/{ => archive}/manipulation/transactions.md (100%) rename docs/src/{ => archive}/manipulation/update.md (100%) 
rename docs/src/{ => archive}/publish-data.md (100%) rename docs/src/{ => archive}/query/aggregation.md (100%) rename docs/src/{ => archive}/query/example-schema.md (100%) rename docs/src/{ => archive}/query/fetch.md (100%) rename docs/src/{ => archive}/query/iteration.md (100%) rename docs/src/{ => archive}/query/join.md (100%) rename docs/src/{ => archive}/query/operators.md (100%) rename docs/src/{ => archive}/query/principles.md (100%) rename docs/src/{ => archive}/query/project.md (100%) rename docs/src/{ => archive}/query/query-caching.md (100%) rename docs/src/{ => archive}/query/restrict.md (100%) rename docs/src/{ => archive}/query/union.md (100%) rename docs/src/{ => archive}/query/universals.md (100%) rename docs/src/{ => archive}/quick-start.md (100%) rename docs/src/{ => archive}/sysadmin/bulk-storage.md (100%) rename docs/src/{ => archive}/sysadmin/database-admin.md (100%) rename docs/src/{ => archive}/sysadmin/external-store.md (100%) rename docs/src/{ => archive}/tutorials/dj-top.ipynb (100%) rename docs/src/{ => archive}/tutorials/json.ipynb (100%) diff --git a/DOCSTRING_STYLE.md b/DOCSTRING_STYLE.md new file mode 100644 index 000000000..77b6dc90a --- /dev/null +++ b/DOCSTRING_STYLE.md @@ -0,0 +1,499 @@ +# DataJoint Python Docstring Style Guide + +This document defines the canonical docstring format for datajoint-python. +All public APIs must follow this NumPy-style format for consistency and +automated documentation generation via mkdocstrings. + +## Quick Reference + +```python +def function(param1, param2, *, keyword_only=None): + """ + Short one-line summary (imperative mood, no period). + + Extended description providing context and details. May span + multiple lines. Explain what the function does, not how. + + Parameters + ---------- + param1 : type + Description of param1. + param2 : type + Description of param2. + keyword_only : type, optional + Description. Default is None. + + Returns + ------- + type + Description of return value. + + Raises + ------ + ExceptionType + When and why this exception is raised. + + Examples + -------- + >>> result = function("value", 42) + >>> print(result) + expected_output + + See Also + -------- + related_function : Brief description. + + Notes + ----- + Additional technical notes, algorithms, or implementation details. + """ +``` + +--- + +## Module Docstrings + +Every module must begin with a docstring explaining its purpose. + +```python +""" +Connection management for DataJoint. + +This module provides the Connection class that manages database connections, +transaction handling, and query execution. It also provides the ``conn()`` +function for accessing a persistent shared connection. + +Key Components +-------------- +Connection : class + Manages a single database connection with transaction support. +conn : function + Returns a persistent connection object shared across modules. + +Example +------- +>>> import datajoint as dj +>>> connection = dj.conn() +>>> connection.query("SHOW DATABASES") +""" +``` + +--- + +## Class Docstrings + +```python +class Table(QueryExpression): + """ + Base class for all DataJoint tables. + + Table implements data manipulation (insert, delete, update) and inherits + query functionality from QueryExpression. Concrete table classes must + define the ``definition`` property specifying the table structure. + + Parameters + ---------- + None + Tables are typically instantiated via schema decoration, not directly. 
+ + Attributes + ---------- + definition : str + DataJoint table definition string (DDL). + primary_key : list of str + Names of primary key attributes. + heading : Heading + Table heading with attribute metadata. + + Examples + -------- + Define a table using the schema decorator: + + >>> @schema + ... class Mouse(dj.Manual): + ... definition = ''' + ... mouse_id : int + ... --- + ... dob : date + ... sex : enum("M", "F", "U") + ... ''' + + Insert data: + + >>> Mouse.insert1({"mouse_id": 1, "dob": "2024-01-15", "sex": "M"}) + + See Also + -------- + Manual : Table for manually entered data. + Computed : Table for computed results. + QueryExpression : Query operator base class. + """ +``` + +--- + +## Method Docstrings + +### Standard Method + +```python +def insert(self, rows, *, replace=False, skip_duplicates=False, ignore_extra_fields=False): + """ + Insert one or more rows into the table. + + Parameters + ---------- + rows : iterable + Rows to insert. Each row can be: + - dict: ``{"attr": value, ...}`` + - numpy.void: Record array element + - sequence: Values in heading order + - QueryExpression: Results of a query + - pathlib.Path: Path to CSV file + replace : bool, optional + If True, replace existing rows with matching primary keys. + Default is False. + skip_duplicates : bool, optional + If True, silently skip rows that would cause duplicate key errors. + Default is False. + ignore_extra_fields : bool, optional + If True, ignore fields not in the table heading. + Default is False. + + Returns + ------- + None + + Raises + ------ + DuplicateError + When inserting a row with an existing primary key and neither + ``replace`` nor ``skip_duplicates`` is True. + DataJointError + When required attributes are missing or types are incompatible. + + Examples + -------- + Insert a single row: + + >>> Mouse.insert1({"mouse_id": 1, "dob": "2024-01-15", "sex": "M"}) + + Insert multiple rows: + + >>> Mouse.insert([ + ... {"mouse_id": 2, "dob": "2024-02-01", "sex": "F"}, + ... {"mouse_id": 3, "dob": "2024-02-15", "sex": "M"}, + ... ]) + + Insert from a query: + + >>> TargetTable.insert(SourceTable & "condition > 5") + + See Also + -------- + insert1 : Insert exactly one row. + """ +``` + +### Method with Complex Return + +```python +def fetch(self, *attrs, offset=None, limit=None, order_by=None, format=None, as_dict=False): + """ + Retrieve data from the table. + + Parameters + ---------- + *attrs : str + Attribute names to fetch. If empty, fetches all attributes. + Use "KEY" to fetch primary key as dict. + offset : int, optional + Number of rows to skip. Default is None (no offset). + limit : int, optional + Maximum number of rows to return. Default is None (no limit). + order_by : str or list of str, optional + Attribute(s) to sort by. Use "KEY" for primary key order, + append " DESC" for descending. Default is None (unordered). + format : {"array", "frame"}, optional + Output format when fetching all attributes: + - "array": numpy structured array (default) + - "frame": pandas DataFrame + as_dict : bool, optional + If True, return list of dicts instead of structured array. + Default is False. 
+ + Returns + ------- + numpy.ndarray or list of dict or pandas.DataFrame + Query results in the requested format: + - Single attribute: 1D array of values + - Multiple attributes: tuple of 1D arrays + - No attributes specified: structured array, DataFrame, or list of dicts + + Examples + -------- + Fetch all data as structured array: + + >>> data = Mouse.fetch() + + Fetch specific attributes: + + >>> ids, dobs = Mouse.fetch("mouse_id", "dob") + + Fetch as list of dicts: + + >>> rows = Mouse.fetch(as_dict=True) + >>> for row in rows: + ... print(row["mouse_id"]) + + Fetch with ordering and limit: + + >>> recent = Mouse.fetch(order_by="dob DESC", limit=10) + + See Also + -------- + fetch1 : Fetch exactly one row. + head : Fetch first N rows ordered by key. + tail : Fetch last N rows ordered by key. + """ +``` + +### Generator Method + +```python +def make(self, key): + """ + Compute and insert results for one key. + + This method must be implemented by subclasses of Computed or Imported + tables. It is called by ``populate()`` for each key in ``key_source`` + that is not yet in the table. + + The method can be implemented in two ways: + + **Simple mode** (regular method): + Fetch, compute, and insert within a single transaction. + + **Tripartite mode** (generator method): + Split into ``make_fetch``, ``make_compute``, ``make_insert`` for + long-running computations with deferred transactions. + + Parameters + ---------- + key : dict + Primary key values identifying the entity to compute. + + Yields + ------ + tuple + In tripartite mode, yields fetched data and computed results. + + Raises + ------ + NotImplementedError + If neither ``make`` nor the tripartite methods are implemented. + + Examples + -------- + Simple implementation: + + >>> class ProcessedData(dj.Computed): + ... definition = ''' + ... -> RawData + ... --- + ... result : float + ... ''' + ... + ... def make(self, key): + ... raw = (RawData & key).fetch1("data") + ... result = expensive_computation(raw) + ... self.insert1({**key, "result": result}) + + See Also + -------- + populate : Execute make for all pending keys. + key_source : Query defining keys to populate. + """ +``` + +--- + +## Property Docstrings + +```python +@property +def primary_key(self): + """ + list of str : Names of primary key attributes. + + The primary key uniquely identifies each row in the table. + Derived from the table definition. + + Examples + -------- + >>> Mouse.primary_key + ['mouse_id'] + """ + return self.heading.primary_key +``` + +--- + +## Parameter Types + +Use these type annotations in docstrings: + +| Python Type | Docstring Format | +|-------------|------------------| +| `str` | `str` | +| `int` | `int` | +| `float` | `float` | +| `bool` | `bool` | +| `None` | `None` | +| `list` | `list` or `list of str` | +| `dict` | `dict` or `dict[str, int]` | +| `tuple` | `tuple` or `tuple of (str, int)` | +| Optional | `str or None` or `str, optional` | +| Union | `str or int` | +| Literal | `{"option1", "option2"}` | +| Callable | `callable` | +| Class | `ClassName` | +| Any | `object` | + +--- + +## Section Order + +Sections must appear in this order (include only relevant sections): + +1. **Short Summary** (required) - One line, imperative mood +2. **Deprecation Warning** - If applicable +3. **Extended Summary** - Additional context +4. **Parameters** - Input arguments +5. **Returns** / **Yields** - Output values +6. **Raises** - Exceptions +7. **Warns** - Warnings issued +8. **See Also** - Related functions/classes +9. 
**Notes** - Technical details +10. **References** - Citations +11. **Examples** (strongly encouraged) - Usage demonstrations + +--- + +## Style Rules + +### Do + +- Use imperative mood: "Insert rows" not "Inserts rows" +- Start with capital letter, no period at end of summary +- Document all public methods +- Include at least one example for public APIs +- Use backticks for code: ``parameter``, ``ClassName`` +- Reference related items in See Also + +### Don't + +- Don't document private methods extensively (brief is fine) +- Don't repeat the function signature in the description +- Don't use "This function..." or "This method..." +- Don't include implementation details in Parameters +- Don't use first person ("I", "we") + +--- + +## Examples Section Best Practices + +```python +""" +Examples +-------- +Basic usage: + +>>> table.insert1({"id": 1, "value": 42}) + +With options: + +>>> table.insert(rows, skip_duplicates=True) + +Error handling: + +>>> try: +... table.insert1({"id": 1}) # duplicate +... except dj.errors.DuplicateError: +... print("Already exists") +Already exists +""" +``` + +--- + +## Converting from Sphinx Style + +Replace Sphinx-style docstrings: + +```python +# Before (Sphinx style) +def method(self, param1, param2): + """ + Brief description. + + :param param1: Description of param1. + :type param1: str + :param param2: Description of param2. + :type param2: int + :returns: Description of return value. + :rtype: bool + :raises ValueError: When param1 is empty. + """ + +# After (NumPy style) +def method(self, param1, param2): + """ + Brief description. + + Parameters + ---------- + param1 : str + Description of param1. + param2 : int + Description of param2. + + Returns + ------- + bool + Description of return value. + + Raises + ------ + ValueError + When param1 is empty. + """ +``` + +--- + +## Validation + +Docstrings are validated by: + +1. **mkdocstrings** - Parses for API documentation +2. **ruff** - Linting (D100-D417 rules when enabled) +3. 
**pytest --doctest-modules** - Executes examples + +Run locally: + +```bash +# Build docs to check parsing +mkdocs build --config-file docs/mkdocs.yaml + +# Check docstring examples +pytest --doctest-modules src/datajoint/ +``` + +--- + +## References + +- [NumPy Docstring Guide](https://numpydoc.readthedocs.io/en/latest/format.html) +- [mkdocstrings Python Handler](https://mkdocstrings.github.io/python/) +- [PEP 257 - Docstring Conventions](https://peps.python.org/pep-0257/) diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index 03c10f69b..554e456cc 100644 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -1,82 +1,17 @@ # ---------------------- PROJECT SPECIFIC --------------------------- -site_name: DataJoint Documentation +site_name: DataJoint Python - Developer Documentation +site_description: Developer documentation for DataJoint Python contributors repo_url: https://github.com/datajoint/datajoint-python repo_name: datajoint/datajoint-python nav: - - DataJoint Python: index.md - - Quick Start Guide: quick-start.md - - Concepts: - - Principles: concepts/principles.md - - Data Model: concepts/data-model.md - - Data Pipelines: concepts/data-pipelines.md - - Teamwork: concepts/teamwork.md - - Terminology: concepts/terminology.md - - System Administration: - - Database Administration: sysadmin/database-admin.md - - Bulk Storage Systems: sysadmin/bulk-storage.md - - External Store: sysadmin/external-store.md - - Client Configuration: - - Install: client/install.md - - Credentials: client/credentials.md - - Settings: client/settings.md - - File Stores: client/stores.md - - Schema Design: - - Schema Creation: design/schema.md - - Table Definition: - - Table Tiers: design/tables/tiers.md - - Declaration Syntax: design/tables/declare.md - - Primary Key: design/tables/primary.md - - Attributes: design/tables/attributes.md - - Lookup Tables: design/tables/lookup.md - - Manual Tables: design/tables/manual.md - - Blobs: design/tables/blobs.md - - Attachments: design/tables/attach.md - - Filepaths: design/tables/filepath.md - - Custom Codecs: design/tables/codecs.md - - Dependencies: design/tables/dependencies.md - - Indexes: design/tables/indexes.md - - Master-Part Relationships: design/tables/master-part.md - - Schema Diagrams: design/diagrams.md - - Entity Normalization: design/normalization.md - - Data Integrity: design/integrity.md - - Schema Recall: design/recall.md - - Schema Drop: design/drop.md - - Schema Modification: design/alter.md - - Data Manipulations: - - manipulation/index.md - - Insert: manipulation/insert.md - - Delete: manipulation/delete.md - - Update: manipulation/update.md - - Transactions: manipulation/transactions.md - - Data Queries: - - Principles: query/principles.md - - Example Schema: query/example-schema.md - - Fetch: query/fetch.md - - Iteration: query/iteration.md - - Operators: query/operators.md - - Restrict: query/restrict.md - - Projection: query/project.md - - Join: query/join.md - - Aggregation: query/aggregation.md - - Union: query/union.md - - Universal Sets: query/universals.md - - Query Caching: query/query-caching.md - - Computations: - - Make Method: compute/make.md - - Populate: compute/populate.md - - Key Source: compute/key-source.md - - Distributed Computing: compute/distributed.md - - Publish Data: publish-data.md - - Internals: - - SQL Transpilation: internal/transpilation.md - - Tutorials: - - JSON Datatype: tutorials/json.ipynb - - FAQ: faq.md - - Developer Guide: develop.md - - Citation: citation.md + - Home: index.md + - Contributing: develop.md 
+ - Architecture: + - architecture/index.md + - SQL Transpilation: architecture/transpilation.md - Changelog: changelog.md - - API: api/ # defer to gen-files + literate-nav + - API Reference: api/ # defer to gen-files + literate-nav # ---------------------------- STANDARD ----------------------------- @@ -93,7 +28,7 @@ theme: favicon: assets/images/company-logo-blue.png features: - toc.integrate - - content.code.annotate # Add codeblock annotations + - content.code.annotate palette: - media: "(prefers-color-scheme: light)" scheme: datajoint @@ -113,26 +48,18 @@ plugins: handlers: python: paths: - - "." - - /main/ + - "../src" options: - filters: - - "!^_" - docstring_style: sphinx # Replaces google default pending docstring updates + docstring_style: numpy members_order: source group_by_category: false line_length: 88 + show_source: false - gen-files: scripts: - ./src/api/make_pages.py - literate-nav: nav_file: navigation.md - - exclude-search: - exclude: - - "*/navigation.md" - - "*/archive/*md" - - mkdocs-jupyter: - include: ["*.ipynb"] - section-index markdown_extensions: - attr_list @@ -154,41 +81,23 @@ markdown_extensions: - name: mermaid class: mermaid format: !!python/name:pymdownx.superfences.fence_code_format - - pymdownx.magiclink # Displays bare URLs as links - - pymdownx.tasklist: # Renders check boxes in tasks lists + - pymdownx.magiclink + - pymdownx.tasklist: custom_checkbox: true - md_in_html extra: - generator: false # Disable watermark + generator: false version: provider: mike social: - icon: main/company-logo link: https://www.datajoint.com name: DataJoint - - icon: fontawesome/brands/slack - link: https://datajoint.slack.com - name: Slack - - icon: fontawesome/brands/linkedin - link: https://www.linkedin.com/company/datajoint - name: LinkedIn - - icon: fontawesome/brands/twitter - link: https://twitter.com/datajoint - name: Twitter - icon: fontawesome/brands/github link: https://github.com/datajoint name: GitHub - - icon: fontawesome/brands/docker - link: https://hub.docker.com/u/datajoint - name: DockerHub - - icon: fontawesome/brands/python - link: https://pypi.org/user/datajointbot - name: PyPI - - icon: fontawesome/brands/stack-overflow - link: https://stackoverflow.com/questions/tagged/datajoint - name: StackOverflow - - icon: fontawesome/brands/youtube - link: https://www.youtube.com/channel/UCdeCuFOTCXlVMRzh6Wk-lGg - name: YouTube + - icon: fontawesome/brands/slack + link: https://datajoint.slack.com + name: Slack extra_css: - assets/stylesheets/extra.css diff --git a/docs/src/architecture/index.md b/docs/src/architecture/index.md new file mode 100644 index 000000000..953fd7962 --- /dev/null +++ b/docs/src/architecture/index.md @@ -0,0 +1,34 @@ +# Architecture + +Internal design documentation for DataJoint developers. + +## Query System + +- [SQL Transpilation](transpilation.md) β€” How DataJoint translates query expressions to SQL + +## Design Principles + +DataJoint's architecture follows several key principles: + +1. **Immutable Query Expressions** β€” Query expressions are immutable; operators create new objects +2. **Lazy Evaluation** β€” Queries are not executed until data is fetched +3. **Query Optimization** β€” Unnecessary attributes are projected out before execution +4. 
**Semantic Matching** β€” Joins use lineage-based attribute matching + +## Module Overview + +| Module | Purpose | +|--------|---------| +| `expression.py` | QueryExpression base class and operators | +| `table.py` | Table class with data manipulation | +| `fetch.py` | Data retrieval implementation | +| `declare.py` | Table definition parsing | +| `heading.py` | Attribute and heading management | +| `blob.py` | Blob serialization | +| `codecs.py` | Type codec system | +| `connection.py` | Database connection management | +| `schemas.py` | Schema binding and activation | + +## Contributing + +See the [Contributing Guide](../develop.md) for development setup instructions. diff --git a/docs/src/internal/transpilation.md b/docs/src/architecture/transpilation.md similarity index 100% rename from docs/src/internal/transpilation.md rename to docs/src/architecture/transpilation.md diff --git a/docs/src/citation.md b/docs/src/archive/citation.md similarity index 100% rename from docs/src/citation.md rename to docs/src/archive/citation.md diff --git a/docs/src/client/credentials.md b/docs/src/archive/client/credentials.md similarity index 100% rename from docs/src/client/credentials.md rename to docs/src/archive/client/credentials.md diff --git a/docs/src/client/install.md b/docs/src/archive/client/install.md similarity index 100% rename from docs/src/client/install.md rename to docs/src/archive/client/install.md diff --git a/docs/src/client/settings.md b/docs/src/archive/client/settings.md similarity index 100% rename from docs/src/client/settings.md rename to docs/src/archive/client/settings.md diff --git a/docs/src/compute/autopopulate2.0-spec.md b/docs/src/archive/compute/autopopulate2.0-spec.md similarity index 100% rename from docs/src/compute/autopopulate2.0-spec.md rename to docs/src/archive/compute/autopopulate2.0-spec.md diff --git a/docs/src/compute/distributed.md b/docs/src/archive/compute/distributed.md similarity index 100% rename from docs/src/compute/distributed.md rename to docs/src/archive/compute/distributed.md diff --git a/docs/src/compute/key-source.md b/docs/src/archive/compute/key-source.md similarity index 100% rename from docs/src/compute/key-source.md rename to docs/src/archive/compute/key-source.md diff --git a/docs/src/compute/make.md b/docs/src/archive/compute/make.md similarity index 100% rename from docs/src/compute/make.md rename to docs/src/archive/compute/make.md diff --git a/docs/src/compute/populate.md b/docs/src/archive/compute/populate.md similarity index 100% rename from docs/src/compute/populate.md rename to docs/src/archive/compute/populate.md diff --git a/docs/src/concepts/data-model.md b/docs/src/archive/concepts/data-model.md similarity index 100% rename from docs/src/concepts/data-model.md rename to docs/src/archive/concepts/data-model.md diff --git a/docs/src/concepts/data-pipelines.md b/docs/src/archive/concepts/data-pipelines.md similarity index 100% rename from docs/src/concepts/data-pipelines.md rename to docs/src/archive/concepts/data-pipelines.md diff --git a/docs/src/concepts/principles.md b/docs/src/archive/concepts/principles.md similarity index 100% rename from docs/src/concepts/principles.md rename to docs/src/archive/concepts/principles.md diff --git a/docs/src/concepts/teamwork.md b/docs/src/archive/concepts/teamwork.md similarity index 100% rename from docs/src/concepts/teamwork.md rename to docs/src/archive/concepts/teamwork.md diff --git a/docs/src/concepts/terminology.md b/docs/src/archive/concepts/terminology.md similarity index 100% 
rename from docs/src/concepts/terminology.md rename to docs/src/archive/concepts/terminology.md diff --git a/docs/src/design/alter.md b/docs/src/archive/design/alter.md similarity index 100% rename from docs/src/design/alter.md rename to docs/src/archive/design/alter.md diff --git a/docs/src/design/diagrams.md b/docs/src/archive/design/diagrams.md similarity index 100% rename from docs/src/design/diagrams.md rename to docs/src/archive/design/diagrams.md diff --git a/docs/src/design/drop.md b/docs/src/archive/design/drop.md similarity index 100% rename from docs/src/design/drop.md rename to docs/src/archive/design/drop.md diff --git a/docs/src/design/fetch-api-2.0-spec.md b/docs/src/archive/design/fetch-api-2.0-spec.md similarity index 100% rename from docs/src/design/fetch-api-2.0-spec.md rename to docs/src/archive/design/fetch-api-2.0-spec.md diff --git a/docs/src/design/hidden-job-metadata-spec.md b/docs/src/archive/design/hidden-job-metadata-spec.md similarity index 100% rename from docs/src/design/hidden-job-metadata-spec.md rename to docs/src/archive/design/hidden-job-metadata-spec.md diff --git a/docs/src/design/integrity.md b/docs/src/archive/design/integrity.md similarity index 100% rename from docs/src/design/integrity.md rename to docs/src/archive/design/integrity.md diff --git a/docs/src/design/normalization.md b/docs/src/archive/design/normalization.md similarity index 100% rename from docs/src/design/normalization.md rename to docs/src/archive/design/normalization.md diff --git a/docs/src/design/pk-rules-spec.md b/docs/src/archive/design/pk-rules-spec.md similarity index 100% rename from docs/src/design/pk-rules-spec.md rename to docs/src/archive/design/pk-rules-spec.md diff --git a/docs/src/design/recall.md b/docs/src/archive/design/recall.md similarity index 100% rename from docs/src/design/recall.md rename to docs/src/archive/design/recall.md diff --git a/docs/src/design/schema.md b/docs/src/archive/design/schema.md similarity index 100% rename from docs/src/design/schema.md rename to docs/src/archive/design/schema.md diff --git a/docs/src/design/semantic-matching-spec.md b/docs/src/archive/design/semantic-matching-spec.md similarity index 100% rename from docs/src/design/semantic-matching-spec.md rename to docs/src/archive/design/semantic-matching-spec.md diff --git a/docs/src/design/tables/attach.md b/docs/src/archive/design/tables/attach.md similarity index 100% rename from docs/src/design/tables/attach.md rename to docs/src/archive/design/tables/attach.md diff --git a/docs/src/design/tables/attributes.md b/docs/src/archive/design/tables/attributes.md similarity index 100% rename from docs/src/design/tables/attributes.md rename to docs/src/archive/design/tables/attributes.md diff --git a/docs/src/design/tables/blobs.md b/docs/src/archive/design/tables/blobs.md similarity index 100% rename from docs/src/design/tables/blobs.md rename to docs/src/archive/design/tables/blobs.md diff --git a/docs/src/design/tables/codec-spec.md b/docs/src/archive/design/tables/codec-spec.md similarity index 100% rename from docs/src/design/tables/codec-spec.md rename to docs/src/archive/design/tables/codec-spec.md diff --git a/docs/src/design/tables/codecs.md b/docs/src/archive/design/tables/codecs.md similarity index 100% rename from docs/src/design/tables/codecs.md rename to docs/src/archive/design/tables/codecs.md diff --git a/docs/src/design/tables/declare.md b/docs/src/archive/design/tables/declare.md similarity index 100% rename from docs/src/design/tables/declare.md rename to 
docs/src/archive/design/tables/declare.md diff --git a/docs/src/design/tables/dependencies.md b/docs/src/archive/design/tables/dependencies.md similarity index 100% rename from docs/src/design/tables/dependencies.md rename to docs/src/archive/design/tables/dependencies.md diff --git a/docs/src/design/tables/filepath.md b/docs/src/archive/design/tables/filepath.md similarity index 100% rename from docs/src/design/tables/filepath.md rename to docs/src/archive/design/tables/filepath.md diff --git a/docs/src/design/tables/indexes.md b/docs/src/archive/design/tables/indexes.md similarity index 100% rename from docs/src/design/tables/indexes.md rename to docs/src/archive/design/tables/indexes.md diff --git a/docs/src/design/tables/lookup.md b/docs/src/archive/design/tables/lookup.md similarity index 100% rename from docs/src/design/tables/lookup.md rename to docs/src/archive/design/tables/lookup.md diff --git a/docs/src/design/tables/manual.md b/docs/src/archive/design/tables/manual.md similarity index 100% rename from docs/src/design/tables/manual.md rename to docs/src/archive/design/tables/manual.md diff --git a/docs/src/design/tables/master-part.md b/docs/src/archive/design/tables/master-part.md similarity index 100% rename from docs/src/design/tables/master-part.md rename to docs/src/archive/design/tables/master-part.md diff --git a/docs/src/design/tables/object.md b/docs/src/archive/design/tables/object.md similarity index 100% rename from docs/src/design/tables/object.md rename to docs/src/archive/design/tables/object.md diff --git a/docs/src/design/tables/primary.md b/docs/src/archive/design/tables/primary.md similarity index 100% rename from docs/src/design/tables/primary.md rename to docs/src/archive/design/tables/primary.md diff --git a/docs/src/design/tables/storage-types-spec.md b/docs/src/archive/design/tables/storage-types-spec.md similarity index 100% rename from docs/src/design/tables/storage-types-spec.md rename to docs/src/archive/design/tables/storage-types-spec.md diff --git a/docs/src/design/tables/tiers.md b/docs/src/archive/design/tables/tiers.md similarity index 100% rename from docs/src/design/tables/tiers.md rename to docs/src/archive/design/tables/tiers.md diff --git a/docs/src/faq.md b/docs/src/archive/faq.md similarity index 100% rename from docs/src/faq.md rename to docs/src/archive/faq.md diff --git a/docs/src/images/StudentTable.png b/docs/src/archive/images/StudentTable.png similarity index 100% rename from docs/src/images/StudentTable.png rename to docs/src/archive/images/StudentTable.png diff --git a/docs/src/images/added-example-ERD.svg b/docs/src/archive/images/added-example-ERD.svg similarity index 100% rename from docs/src/images/added-example-ERD.svg rename to docs/src/archive/images/added-example-ERD.svg diff --git a/docs/src/images/data-engineering.png b/docs/src/archive/images/data-engineering.png similarity index 100% rename from docs/src/images/data-engineering.png rename to docs/src/archive/images/data-engineering.png diff --git a/docs/src/images/data-science-after.png b/docs/src/archive/images/data-science-after.png similarity index 100% rename from docs/src/images/data-science-after.png rename to docs/src/archive/images/data-science-after.png diff --git a/docs/src/images/data-science-before.png b/docs/src/archive/images/data-science-before.png similarity index 100% rename from docs/src/images/data-science-before.png rename to docs/src/archive/images/data-science-before.png diff --git a/docs/src/images/diff-example1.png 
b/docs/src/archive/images/diff-example1.png similarity index 100% rename from docs/src/images/diff-example1.png rename to docs/src/archive/images/diff-example1.png diff --git a/docs/src/images/diff-example2.png b/docs/src/archive/images/diff-example2.png similarity index 100% rename from docs/src/images/diff-example2.png rename to docs/src/archive/images/diff-example2.png diff --git a/docs/src/images/diff-example3.png b/docs/src/archive/images/diff-example3.png similarity index 100% rename from docs/src/images/diff-example3.png rename to docs/src/archive/images/diff-example3.png diff --git a/docs/src/images/dimitri-ERD.svg b/docs/src/archive/images/dimitri-ERD.svg similarity index 100% rename from docs/src/images/dimitri-ERD.svg rename to docs/src/archive/images/dimitri-ERD.svg diff --git a/docs/src/images/doc_1-1.png b/docs/src/archive/images/doc_1-1.png similarity index 100% rename from docs/src/images/doc_1-1.png rename to docs/src/archive/images/doc_1-1.png diff --git a/docs/src/images/doc_1-many.png b/docs/src/archive/images/doc_1-many.png similarity index 100% rename from docs/src/images/doc_1-many.png rename to docs/src/archive/images/doc_1-many.png diff --git a/docs/src/images/doc_many-1.png b/docs/src/archive/images/doc_many-1.png similarity index 100% rename from docs/src/images/doc_many-1.png rename to docs/src/archive/images/doc_many-1.png diff --git a/docs/src/images/doc_many-many.png b/docs/src/archive/images/doc_many-many.png similarity index 100% rename from docs/src/images/doc_many-many.png rename to docs/src/archive/images/doc_many-many.png diff --git a/docs/src/images/how-it-works.png b/docs/src/archive/images/how-it-works.png similarity index 100% rename from docs/src/images/how-it-works.png rename to docs/src/archive/images/how-it-works.png diff --git a/docs/src/images/install-cmd-prompt.png b/docs/src/archive/images/install-cmd-prompt.png similarity index 100% rename from docs/src/images/install-cmd-prompt.png rename to docs/src/archive/images/install-cmd-prompt.png diff --git a/docs/src/images/install-datajoint-1.png b/docs/src/archive/images/install-datajoint-1.png similarity index 100% rename from docs/src/images/install-datajoint-1.png rename to docs/src/archive/images/install-datajoint-1.png diff --git a/docs/src/images/install-datajoint-2.png b/docs/src/archive/images/install-datajoint-2.png similarity index 100% rename from docs/src/images/install-datajoint-2.png rename to docs/src/archive/images/install-datajoint-2.png diff --git a/docs/src/images/install-git-1.png b/docs/src/archive/images/install-git-1.png similarity index 100% rename from docs/src/images/install-git-1.png rename to docs/src/archive/images/install-git-1.png diff --git a/docs/src/images/install-graphviz-1.png b/docs/src/archive/images/install-graphviz-1.png similarity index 100% rename from docs/src/images/install-graphviz-1.png rename to docs/src/archive/images/install-graphviz-1.png diff --git a/docs/src/images/install-graphviz-2a.png b/docs/src/archive/images/install-graphviz-2a.png similarity index 100% rename from docs/src/images/install-graphviz-2a.png rename to docs/src/archive/images/install-graphviz-2a.png diff --git a/docs/src/images/install-graphviz-2b.png b/docs/src/archive/images/install-graphviz-2b.png similarity index 100% rename from docs/src/images/install-graphviz-2b.png rename to docs/src/archive/images/install-graphviz-2b.png diff --git a/docs/src/images/install-jupyter-1.png b/docs/src/archive/images/install-jupyter-1.png similarity index 100% rename from 
docs/src/images/install-jupyter-1.png rename to docs/src/archive/images/install-jupyter-1.png diff --git a/docs/src/images/install-jupyter-2.png b/docs/src/archive/images/install-jupyter-2.png similarity index 100% rename from docs/src/images/install-jupyter-2.png rename to docs/src/archive/images/install-jupyter-2.png diff --git a/docs/src/images/install-matplotlib.png b/docs/src/archive/images/install-matplotlib.png similarity index 100% rename from docs/src/images/install-matplotlib.png rename to docs/src/archive/images/install-matplotlib.png diff --git a/docs/src/images/install-pydotplus.png b/docs/src/archive/images/install-pydotplus.png similarity index 100% rename from docs/src/images/install-pydotplus.png rename to docs/src/archive/images/install-pydotplus.png diff --git a/docs/src/images/install-python-advanced-1.png b/docs/src/archive/images/install-python-advanced-1.png similarity index 100% rename from docs/src/images/install-python-advanced-1.png rename to docs/src/archive/images/install-python-advanced-1.png diff --git a/docs/src/images/install-python-advanced-2.png b/docs/src/archive/images/install-python-advanced-2.png similarity index 100% rename from docs/src/images/install-python-advanced-2.png rename to docs/src/archive/images/install-python-advanced-2.png diff --git a/docs/src/images/install-python-simple.png b/docs/src/archive/images/install-python-simple.png similarity index 100% rename from docs/src/images/install-python-simple.png rename to docs/src/archive/images/install-python-simple.png diff --git a/docs/src/images/install-run-jupyter-1.png b/docs/src/archive/images/install-run-jupyter-1.png similarity index 100% rename from docs/src/images/install-run-jupyter-1.png rename to docs/src/archive/images/install-run-jupyter-1.png diff --git a/docs/src/images/install-run-jupyter-2.png b/docs/src/archive/images/install-run-jupyter-2.png similarity index 100% rename from docs/src/images/install-run-jupyter-2.png rename to docs/src/archive/images/install-run-jupyter-2.png diff --git a/docs/src/images/install-verify-graphviz.png b/docs/src/archive/images/install-verify-graphviz.png similarity index 100% rename from docs/src/images/install-verify-graphviz.png rename to docs/src/archive/images/install-verify-graphviz.png diff --git a/docs/src/images/install-verify-jupyter.png b/docs/src/archive/images/install-verify-jupyter.png similarity index 100% rename from docs/src/images/install-verify-jupyter.png rename to docs/src/archive/images/install-verify-jupyter.png diff --git a/docs/src/images/install-verify-python.png b/docs/src/archive/images/install-verify-python.png similarity index 100% rename from docs/src/images/install-verify-python.png rename to docs/src/archive/images/install-verify-python.png diff --git a/docs/src/images/join-example1.png b/docs/src/archive/images/join-example1.png similarity index 100% rename from docs/src/images/join-example1.png rename to docs/src/archive/images/join-example1.png diff --git a/docs/src/images/join-example2.png b/docs/src/archive/images/join-example2.png similarity index 100% rename from docs/src/images/join-example2.png rename to docs/src/archive/images/join-example2.png diff --git a/docs/src/images/join-example3.png b/docs/src/archive/images/join-example3.png similarity index 100% rename from docs/src/images/join-example3.png rename to docs/src/archive/images/join-example3.png diff --git a/docs/src/images/key_source_combination.png b/docs/src/archive/images/key_source_combination.png similarity index 100% rename from 
docs/src/images/key_source_combination.png rename to docs/src/archive/images/key_source_combination.png diff --git a/docs/src/images/map-dataflow.png b/docs/src/archive/images/map-dataflow.png similarity index 100% rename from docs/src/images/map-dataflow.png rename to docs/src/archive/images/map-dataflow.png diff --git a/docs/src/images/matched_tuples1.png b/docs/src/archive/images/matched_tuples1.png similarity index 100% rename from docs/src/images/matched_tuples1.png rename to docs/src/archive/images/matched_tuples1.png diff --git a/docs/src/images/matched_tuples2.png b/docs/src/archive/images/matched_tuples2.png similarity index 100% rename from docs/src/images/matched_tuples2.png rename to docs/src/archive/images/matched_tuples2.png diff --git a/docs/src/images/matched_tuples3.png b/docs/src/archive/images/matched_tuples3.png similarity index 100% rename from docs/src/images/matched_tuples3.png rename to docs/src/archive/images/matched_tuples3.png diff --git a/docs/src/images/mp-diagram.png b/docs/src/archive/images/mp-diagram.png similarity index 100% rename from docs/src/images/mp-diagram.png rename to docs/src/archive/images/mp-diagram.png diff --git a/docs/src/images/op-restrict.png b/docs/src/archive/images/op-restrict.png similarity index 100% rename from docs/src/images/op-restrict.png rename to docs/src/archive/images/op-restrict.png diff --git a/docs/src/images/outer-example1.png b/docs/src/archive/images/outer-example1.png similarity index 100% rename from docs/src/images/outer-example1.png rename to docs/src/archive/images/outer-example1.png diff --git a/docs/src/images/pipeline-database.png b/docs/src/archive/images/pipeline-database.png similarity index 100% rename from docs/src/images/pipeline-database.png rename to docs/src/archive/images/pipeline-database.png diff --git a/docs/src/images/pipeline.png b/docs/src/archive/images/pipeline.png similarity index 100% rename from docs/src/images/pipeline.png rename to docs/src/archive/images/pipeline.png diff --git a/docs/src/images/python_collection.png b/docs/src/archive/images/python_collection.png similarity index 100% rename from docs/src/images/python_collection.png rename to docs/src/archive/images/python_collection.png diff --git a/docs/src/images/queries_example_diagram.png b/docs/src/archive/images/queries_example_diagram.png similarity index 100% rename from docs/src/images/queries_example_diagram.png rename to docs/src/archive/images/queries_example_diagram.png diff --git a/docs/src/images/query_object_preview.png b/docs/src/archive/images/query_object_preview.png similarity index 100% rename from docs/src/images/query_object_preview.png rename to docs/src/archive/images/query_object_preview.png diff --git a/docs/src/images/restrict-example1.png b/docs/src/archive/images/restrict-example1.png similarity index 100% rename from docs/src/images/restrict-example1.png rename to docs/src/archive/images/restrict-example1.png diff --git a/docs/src/images/restrict-example2.png b/docs/src/archive/images/restrict-example2.png similarity index 100% rename from docs/src/images/restrict-example2.png rename to docs/src/archive/images/restrict-example2.png diff --git a/docs/src/images/restrict-example3.png b/docs/src/archive/images/restrict-example3.png similarity index 100% rename from docs/src/images/restrict-example3.png rename to docs/src/archive/images/restrict-example3.png diff --git a/docs/src/images/shapes_pipeline.svg b/docs/src/archive/images/shapes_pipeline.svg similarity index 100% rename from 
docs/src/images/shapes_pipeline.svg rename to docs/src/archive/images/shapes_pipeline.svg diff --git a/docs/src/images/spawned-classes-ERD.svg b/docs/src/archive/images/spawned-classes-ERD.svg similarity index 100% rename from docs/src/images/spawned-classes-ERD.svg rename to docs/src/archive/images/spawned-classes-ERD.svg diff --git a/docs/src/images/union-example1.png b/docs/src/archive/images/union-example1.png similarity index 100% rename from docs/src/images/union-example1.png rename to docs/src/archive/images/union-example1.png diff --git a/docs/src/images/union-example2.png b/docs/src/archive/images/union-example2.png similarity index 100% rename from docs/src/images/union-example2.png rename to docs/src/archive/images/union-example2.png diff --git a/docs/src/images/virtual-module-ERD.svg b/docs/src/archive/images/virtual-module-ERD.svg similarity index 100% rename from docs/src/images/virtual-module-ERD.svg rename to docs/src/archive/images/virtual-module-ERD.svg diff --git a/docs/src/manipulation/delete.md b/docs/src/archive/manipulation/delete.md similarity index 100% rename from docs/src/manipulation/delete.md rename to docs/src/archive/manipulation/delete.md diff --git a/docs/src/manipulation/index.md b/docs/src/archive/manipulation/index.md similarity index 100% rename from docs/src/manipulation/index.md rename to docs/src/archive/manipulation/index.md diff --git a/docs/src/manipulation/insert.md b/docs/src/archive/manipulation/insert.md similarity index 100% rename from docs/src/manipulation/insert.md rename to docs/src/archive/manipulation/insert.md diff --git a/docs/src/manipulation/transactions.md b/docs/src/archive/manipulation/transactions.md similarity index 100% rename from docs/src/manipulation/transactions.md rename to docs/src/archive/manipulation/transactions.md diff --git a/docs/src/manipulation/update.md b/docs/src/archive/manipulation/update.md similarity index 100% rename from docs/src/manipulation/update.md rename to docs/src/archive/manipulation/update.md diff --git a/docs/src/publish-data.md b/docs/src/archive/publish-data.md similarity index 100% rename from docs/src/publish-data.md rename to docs/src/archive/publish-data.md diff --git a/docs/src/query/aggregation.md b/docs/src/archive/query/aggregation.md similarity index 100% rename from docs/src/query/aggregation.md rename to docs/src/archive/query/aggregation.md diff --git a/docs/src/query/example-schema.md b/docs/src/archive/query/example-schema.md similarity index 100% rename from docs/src/query/example-schema.md rename to docs/src/archive/query/example-schema.md diff --git a/docs/src/query/fetch.md b/docs/src/archive/query/fetch.md similarity index 100% rename from docs/src/query/fetch.md rename to docs/src/archive/query/fetch.md diff --git a/docs/src/query/iteration.md b/docs/src/archive/query/iteration.md similarity index 100% rename from docs/src/query/iteration.md rename to docs/src/archive/query/iteration.md diff --git a/docs/src/query/join.md b/docs/src/archive/query/join.md similarity index 100% rename from docs/src/query/join.md rename to docs/src/archive/query/join.md diff --git a/docs/src/query/operators.md b/docs/src/archive/query/operators.md similarity index 100% rename from docs/src/query/operators.md rename to docs/src/archive/query/operators.md diff --git a/docs/src/query/principles.md b/docs/src/archive/query/principles.md similarity index 100% rename from docs/src/query/principles.md rename to docs/src/archive/query/principles.md diff --git a/docs/src/query/project.md 
b/docs/src/archive/query/project.md similarity index 100% rename from docs/src/query/project.md rename to docs/src/archive/query/project.md diff --git a/docs/src/query/query-caching.md b/docs/src/archive/query/query-caching.md similarity index 100% rename from docs/src/query/query-caching.md rename to docs/src/archive/query/query-caching.md diff --git a/docs/src/query/restrict.md b/docs/src/archive/query/restrict.md similarity index 100% rename from docs/src/query/restrict.md rename to docs/src/archive/query/restrict.md diff --git a/docs/src/query/union.md b/docs/src/archive/query/union.md similarity index 100% rename from docs/src/query/union.md rename to docs/src/archive/query/union.md diff --git a/docs/src/query/universals.md b/docs/src/archive/query/universals.md similarity index 100% rename from docs/src/query/universals.md rename to docs/src/archive/query/universals.md diff --git a/docs/src/quick-start.md b/docs/src/archive/quick-start.md similarity index 100% rename from docs/src/quick-start.md rename to docs/src/archive/quick-start.md diff --git a/docs/src/sysadmin/bulk-storage.md b/docs/src/archive/sysadmin/bulk-storage.md similarity index 100% rename from docs/src/sysadmin/bulk-storage.md rename to docs/src/archive/sysadmin/bulk-storage.md diff --git a/docs/src/sysadmin/database-admin.md b/docs/src/archive/sysadmin/database-admin.md similarity index 100% rename from docs/src/sysadmin/database-admin.md rename to docs/src/archive/sysadmin/database-admin.md diff --git a/docs/src/sysadmin/external-store.md b/docs/src/archive/sysadmin/external-store.md similarity index 100% rename from docs/src/sysadmin/external-store.md rename to docs/src/archive/sysadmin/external-store.md diff --git a/docs/src/tutorials/dj-top.ipynb b/docs/src/archive/tutorials/dj-top.ipynb similarity index 100% rename from docs/src/tutorials/dj-top.ipynb rename to docs/src/archive/tutorials/dj-top.ipynb diff --git a/docs/src/tutorials/json.ipynb b/docs/src/archive/tutorials/json.ipynb similarity index 100% rename from docs/src/tutorials/json.ipynb rename to docs/src/archive/tutorials/json.ipynb diff --git a/docs/src/index.md b/docs/src/index.md index 6e3bf2a2d..57d6b99d9 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,44 +1,45 @@ -# Welcome to DataJoint for Python! +# DataJoint for Python -DataJoint for Python is a framework for scientific workflow management based on -relational principles. DataJoint is built on the foundation of the relational data -model and prescribes a consistent method for organizing, populating, computing, and -querying data. +DataJoint is an open-source Python framework for building scientific data pipelines. +It implements the **Relational Workflow Model**β€”a paradigm that extends relational +databases with native support for computational workflows. -DataJoint was initially developed in 2009 by Dimitri Yatsenko in Andreas Tolias' Lab at -Baylor College of Medicine for the distributed processing and management of large -volumes of data streaming from regular experiments. Starting in 2011, DataJoint has -been available as an open-source project adopted by other labs and improved through -contributions from several developers. -Presently, the primary developer of DataJoint open-source software is the company [DataJoint](https://datajoint.com){:target="_blank"}. 
+## Documentation -## Data Pipeline Example +**User documentation** is available at **[docs.datajoint.com](https://docs.datajoint.com)**, including: -![pipeline](https://raw.githubusercontent.com/datajoint/datajoint-python/master/images/pipeline.png) +- Tutorials and getting started guides +- Concepts and explanations +- How-to guides +- API reference -[Yatsenko et al., bioRxiv 2021](https://doi.org/10.1101/2021.03.30.437358){:target="_blank"} +## This Site -## Getting Started +This site contains **developer documentation** for contributors to the DataJoint codebase: -- Install with Conda +- [Contributing Guide](develop.md) β€” Development environment setup +- [Architecture](architecture/index.md) β€” Internal design documentation +- [Changelog](changelog.md) β€” Release history +- [API Reference](api/) β€” Auto-generated from source - ```bash - conda install -c conda-forge datajoint - ``` +## Quick Links -- Install with pip +| Resource | Link | +|----------|------| +| User Documentation | [docs.datajoint.com](https://docs.datajoint.com) | +| GitHub Repository | [github.com/datajoint/datajoint-python](https://github.com/datajoint/datajoint-python) | +| PyPI Package | [pypi.org/project/datajoint](https://pypi.org/project/datajoint) | +| Issue Tracker | [GitHub Issues](https://github.com/datajoint/datajoint-python/issues) | +| Community | [DataJoint Slack](https://datajoint.slack.com) | - ```bash - pip install datajoint - ``` +## Installation -- [Quick Start Guide](./quick-start.md) +```bash +pip install datajoint +``` -- [Interactive Tutorials](https://github.com/datajoint/datajoint-tutorials){:target="_blank"} on GitHub Codespaces +## License -- [DataJoint Elements](https://docs.datajoint.com/elements/) - Catalog of example pipelines for neuroscience experiments +DataJoint is released under the [Apache 2.0 License](https://github.com/datajoint/datajoint-python/blob/master/LICENSE). -- Contribute - - [Development Environment](./develop) - - - [Guidelines](https://docs.datajoint.com/about/contribute/) +Copyright 2024 DataJoint Inc. and contributors. 
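A minimal sketch of the relational workflow model the new landing page describes, assuming a configured database connection; the schema name `tutorial` and both example tables are illustrative and not part of the patch:

```python
import datajoint as dj
from datetime import date

schema = dj.Schema("tutorial")  # hypothetical schema name for illustration


@schema
class Mouse(dj.Manual):
    definition = """
    mouse_id : int              # unique animal id
    ---
    dob : date                  # date of birth
    sex : enum("M", "F", "U")
    """


@schema
class MouseAge(dj.Computed):
    definition = """
    -> Mouse
    ---
    age_days : int              # age at the time of computation
    """

    def make(self, key):
        # Fetch the inputs for one key, compute, and insert the result.
        dob = (Mouse & key).fetch1("dob")
        self.insert1({**key, "age_days": (date.today() - dob).days})


Mouse.insert1({"mouse_id": 1, "dob": "2024-01-15", "sex": "M"})
MouseAge.populate()  # runs make() once per Mouse key not yet computed
```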
From 2108a0c3b50a7440bf2f09894445672e2b9910b5 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 14:30:02 -0600 Subject: [PATCH 07/15] docs: Remove changelog (now auto-generated by GitHub) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/mkdocs.yaml | 1 - docs/src/changelog.md | 1 - docs/src/index.md | 1 - 3 files changed, 3 deletions(-) delete mode 120000 docs/src/changelog.md diff --git a/docs/mkdocs.yaml b/docs/mkdocs.yaml index 554e456cc..db2ea16f9 100644 --- a/docs/mkdocs.yaml +++ b/docs/mkdocs.yaml @@ -10,7 +10,6 @@ nav: - Architecture: - architecture/index.md - SQL Transpilation: architecture/transpilation.md - - Changelog: changelog.md - API Reference: api/ # defer to gen-files + literate-nav # ---------------------------- STANDARD ----------------------------- diff --git a/docs/src/changelog.md b/docs/src/changelog.md deleted file mode 120000 index 699cc9e7b..000000000 --- a/docs/src/changelog.md +++ /dev/null @@ -1 +0,0 @@ -../../CHANGELOG.md \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index 57d6b99d9..63b318a1c 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -19,7 +19,6 @@ This site contains **developer documentation** for contributors to the DataJoint - [Contributing Guide](develop.md) β€” Development environment setup - [Architecture](architecture/index.md) β€” Internal design documentation -- [Changelog](changelog.md) β€” Release history - [API Reference](api/) β€” Auto-generated from source ## Quick Links From b3d3014f7d14ca207b62f085df34286d82cb71ca Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 14:44:07 -0600 Subject: [PATCH 08/15] docs: Streamline contributing guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add quick start code block at top - Lead with local venv setup (primary path) - Move Codespaces to alternative section - Remove manual TOC and back-to-top links - Simplify pre-commit and testing sections - Link to DOCSTRING_STYLE.md for docstring guidelines πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- docs/src/develop.md | 215 ++++++++++++-------------------------------- 1 file changed, 57 insertions(+), 158 deletions(-) diff --git a/docs/src/develop.md b/docs/src/develop.md index a4a1fc534..4643683b6 100644 --- a/docs/src/develop.md +++ b/docs/src/develop.md @@ -1,202 +1,101 @@ -# Developer Guide +# Contributing Guide -## Table of Contents - -- [Contribute to DataJoint Python Documentation](#contribute-to-datajoint-python-documentation) -- [Setup Development Environment](#setup-development-environment) - - [Prerequisites](#prerequisites) - - [With Virtual Environment](#with-virtual-environment) - - [With DevContainer](#with-devcontainer) - - [Extra Efficiency, Optional But Recommended](#extra-efficiency-optional-but-recommended) - - [Pre-commit Hooks](#pre-commit-hooks) - - [Integration Tests](#integration-tests) - - [VSCode](#vscode) - - [Jupyter Extension](#jupyter-extension) - - [Debugger](#debugger) - - [MySQL CLI](#mysql-cli) - -## Contribute to DataJoint Python Documentation - -> Contributions to documentations are equivalently important to any code for the community, please help us to resolve any confusions in documentations. 
- -[Here](https://github.com/datajoint/datajoint-python/blob/master/docs/README.md) is the instructions for contributing documentations, or you can find the same instructions at `$PROJECT_DIR/docs/README.md` in the repository. - -[Back to top](#table-of-contents) - -## Setup Development Environment - -> We have [DevContainer](https://containers.dev/) ready for contributors to develop without setting up their environment. If you are familiar with DevContainer, Docker or Github Codespace, this is the recommended development environment for you. -> If you have never used Docker, it might be easier for you to use a virtual environment through `conda/mamba/venv`, it is also very straightforward to set up. - -### Prerequisites - -- Clone datajoint-python repository +## Quick Start ```bash -# If you have your SSH key set up with GitHub, you can clone using SSH -git clone git@github.com:datajoint/datajoint-python.git -# Otherwise, you can clone using HTTPS +# Clone the repository git clone https://github.com/datajoint/datajoint-python.git -``` -- If you don't use DevContainer, then either install Anaconda/[Miniconda](https://www.anaconda.com/docs/getting-started/miniconda/install)/Mamba, or just use Python's built-in `venv` module without install anything else. - -### With Virtual Environment +cd datajoint-python -```bash -# Check if you have Python 3.10 or higher, if not please upgrade -python --version -# Create a virtual environment with venv +# Create virtual environment (Python 3.10+) python -m venv .venv -source .venv/bin/activate -pip install -e .[dev] +source .venv/bin/activate # On Windows: .venv\Scripts\activate -# Or create a virtual environment with conda -conda create -n dj python=3.13 # any 3.10+ is fine -conda activate dj -pip install -e .[dev] -``` +# Install with development dependencies +pip install -e ".[dev]" -[Back to top](#table-of-contents) +# Install pre-commit hooks +pre-commit install -### With DevContainer +# Run tests +pytest tests +``` -#### Launch Environment +## Development Environment -Here are some options that provide a great developer experience: +### Local Setup -- **Cloud-based IDE**: (*recommended*) - - Launch using [GitHub Codespaces](https://github.com/features/codespaces) using the option `Create codespace on master` in the codebase repository on your fork. - - Build time for a 2-Core codespace is **~6m**. This is done infrequently and cached for convenience. - - Start time for a 2-Core codespace is **~2m**. This will pull the built codespace from cache when you need it. - - *Tip*: GitHub auto names the codespace but you can rename the codespace so that it is easier to identify later. -- **Local IDE (VSCode - Dev Containers)**: - - Ensure you have [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) - - Ensure you have [Docker](https://docs.docker.com/get-docker/) - - Ensure you have [VSCode](https://code.visualstudio.com/) - - Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) - - `git clone` the codebase repository and open it in VSCode - - Use the `Dev Containers extension` to `Reopen in Container` (More info in the `Getting started` included with the extension) - - You will know your environment has finished loading once you see a terminal open related to `Running postStartCommand` with a final message: `Done. Press any key to close the terminal.`. 
-- **Local IDE (Docker Compose)**: - - Ensure you have [Git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) - - Ensure you have [Docker](https://docs.docker.com/get-docker/) - - `git clone` the codebase repository and open it in VSCode - - Issue the following command in the terminal to build and run the Docker container: `HOST_UID=$(id -u) PY_VER=3.11 DJ_VERSION=$(grep -oP '\d+\.\d+\.\d+' src/datajoint/version.py) docker compose --profile test run --rm -it djtest -- sh -c 'pip install -qe ".[dev]" && bash'` - - Issue the following command in the terminal to stop the Docker compose stack: `docker compose --profile test down` +Requirements: -[Back to top](#table-of-contents) +- Python 3.10 or higher +- MySQL 8.0+ or Docker (for running tests) -## Extra Efficiency, Optional But Recommended +The `[dev]` extras install all development tools: pytest, pre-commit, black, ruff, and documentation builders. -### Pre-commit Hooks +### Using Docker for Database -We recommend using [pre-commit](https://pre-commit.com/) to automatically run linters and formatters on your code before committing. -To set up pre-commit, run the following command in your terminal: +Tests require a MySQL database. Start one with Docker: ```bash -pip install pre-commit -pre-commit install +docker compose up -d db ``` -You can manually run pre-commit on all files with the following command: +Configure connection (or set environment variables): ```bash -pre-commit run --all-files +export DJ_HOST=localhost +export DJ_USER=root +export DJ_PASS=password ``` -This will run all the linters and formatters specified in the `.pre-commit-config.yaml` file. If all check passed, you can commit your code. Otherwise, you need to fix the failed checks and run the command again. -> Pre-commit will automatically run the linters and formatters on all staged files before committing. However, if your code doesn't follow the linters and formatters, the commit will fail. -> Some hooks will automatically fix your problem, and add the fixed files as git's `unstaged` files, you just need to add them(`git add .`) to git's `staged` files and commit again. -> Some hooks will not automatically fix your problem, so you need to check the pre-commit failed log to fix them manually and include the update to your `staged` files and commit again. +### Alternative: GitHub Codespaces -If you really don't want to use pre-commit, or if you don't like it, you can uninstall it with the following command: +For a pre-configured environment, use [GitHub Codespaces](https://github.com/features/codespaces): -```bash -pre-commit uninstall -``` +1. Fork the repository +2. Click "Create codespace on master" +3. Wait for environment to build (~6 minutes first time, ~2 minutes from cache) -But when you issue a pull request, the same linter and formatter check will run against your contribution, you are going to have the same failure as well. So without pre-commit, you need to **manually run these linters and formatters before committing your code**: +## Code Quality -- Syntax tests +### Pre-commit Hooks -The following will verify that there are no syntax errors. +Pre-commit runs automatically on `git commit`. To run manually: -``` -flake8 datajoint --count --select=E9,F63,F7,F82 --show-source --statistics +```bash +pre-commit run --all-files ``` -- Style tests +Hooks include: -The following will verify that there are no code styling errors. 
+- **ruff** β€” Linting and import sorting +- **black** β€” Code formatting +- **mypy** β€” Type checking (optional) -``` -flake8 --ignore=E203,E722,W503 datajoint --count --max-complexity=62 --max-line-length=127 --statistics -``` - -The following will ensure the codebase has been formatted with [black](https://black.readthedocs.io/en/stable/). +### Running Tests -``` -black datajoint --check -v --diff -``` +```bash +# Full test suite with coverage +pytest -sv --cov=datajoint tests -The following will ensure the test suite has been formatted with [black](https://black.readthedocs.io/en/stable/). +# Single test file +pytest tests/test_connection.py +# Single test function +pytest tests/test_connection.py::test_dj_conn -v ``` -black tests --check -v --diff -``` - -[Back to top](#table-of-contents) - -### Integration Tests - -The following will verify there are no regression errors by running our test suite of unit and integration tests. - -- Entire test suite: - ``` - pytest -sv --cov-report term-missing --cov=datajoint tests - ``` - -- A single functional test: - ``` - pytest -sv tests/test_connection.py::test_dj_conn - ``` -- A single class test: - ``` - pytest -sv tests/test_aggr_regressions.py::TestIssue558 - ``` -[Back to top](#table-of-contents) +## Submitting Changes -### VSCode +1. Create a feature branch from `master` +2. Make your changes +3. Ensure tests pass and pre-commit is clean +4. Submit a pull request -#### Jupyter Extension +PRs trigger CI checks automatically. All checks must pass before merge. -Be sure to go through this documentation if you are new to [Running Jupyter Notebooks with VSCode](https://code.visualstudio.com/docs/datascience/jupyter-notebooks#_create-or-open-a-jupyter-notebook). +## Documentation -#### Debugger - -[VSCode Debugger](https://code.visualstudio.com/docs/editor/debugging) is a powerful tool that can really accelerate fixes. - -Try it as follows: - -- Create a python script of your choice -- `import datajoint` (This will use the current state of the source) -- Add breakpoints by adding red dots next to line numbers -- Select the `Run and Debug` tab -- Start by clicking the button `Run and Debug` - -[Back to top](#table-of-contents) - -### MySQL CLI - -> Installation instruction is in [here](https://dev.mysql.com/doc/mysql-shell/8.0/en/mysql-shell-install.html) - -It is often useful in development to connect to DataJoint's relational database backend directly using the MySQL CLI. - -Connect as follows to the database running within your developer environment: - -``` -mysql -hdb -uroot -ppassword -``` +Docstrings use NumPy style. See [DOCSTRING_STYLE.md](https://github.com/datajoint/datajoint-python/blob/master/DOCSTRING_STYLE.md) for guidelines. -[Back to top](#table-of-contents) \ No newline at end of file +User documentation is maintained at [docs.datajoint.com](https://docs.datajoint.com). 
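For quick reference, the following is a minimal sketch of the expected docstring format; the `scale_values` function and its parameters are hypothetical, shown only to illustrate the NumPy-style sections and Python 3.10+ type hints described above:

```python
from __future__ import annotations


def scale_values(values: list[float], factor: float | None = None) -> list[float]:
    """
    Scale a list of values by a constant factor.

    Parameters
    ----------
    values : list of float
        Values to scale.
    factor : float or None, optional
        Multiplier applied to each value. If None, values are returned unchanged.

    Returns
    -------
    list of float
        Scaled values.

    Examples
    --------
    >>> scale_values([1.0, 2.0], factor=10.0)
    [10.0, 20.0]
    """
    # Return a copy unchanged when no factor is given; otherwise scale each value.
    return list(values) if factor is None else [v * factor for v in values]
```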
From 13b73a0375ce45849757000ce95b1ac021d1fec8 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Sun, 4 Jan 2026 23:43:40 -0600 Subject: [PATCH 09/15] docs: Add table declaration specification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive spec covering: - Table tiers and class structure - Definition string grammar - Attribute types (core, string, temporal, codec) - Default values and nullable attributes - Foreign key references and options - Index declarations - Part tables - Auto-populated tables - Validation rules - SQL generation πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- specs/table-declaration.md | 587 +++++++++++++++++++++++++++++++++++++ 1 file changed, 587 insertions(+) create mode 100644 specs/table-declaration.md diff --git a/specs/table-declaration.md b/specs/table-declaration.md new file mode 100644 index 000000000..f5f878b78 --- /dev/null +++ b/specs/table-declaration.md @@ -0,0 +1,587 @@ +# DataJoint Table Declaration Specification + +Version: 1.0 +Status: Draft +Last Updated: 2025-01-04 + +## Overview + +This document specifies the table declaration mechanism in DataJoint Python. Table declarations define the schema structure using a domain-specific language (DSL) embedded in Python class definitions. + +## 1. Table Class Structure + +### 1.1 Basic Declaration Pattern + +```python +@schema +class TableName(dj.Manual): + definition = """ + # table comment + primary_attr : int32 + --- + secondary_attr : float64 + """ +``` + +### 1.2 Table Tiers + +| Tier | Base Class | Table Prefix | Purpose | +|------|------------|--------------|---------| +| Manual | `dj.Manual` | (none) | User-entered data | +| Lookup | `dj.Lookup` | `#` | Reference/enumeration data | +| Imported | `dj.Imported` | `_` | Data from external sources | +| Computed | `dj.Computed` | `__` | Derived from other tables | +| Part | `dj.Part` | `master__` | Detail records of master table | + +### 1.3 Class Naming Rules + +- **Format**: Strict CamelCase (e.g., `MyTable`, `ProcessedData`) +- **Pattern**: `^[A-Z][A-Za-z0-9]*$` +- **Conversion**: CamelCase to snake_case for SQL table name +- **Examples**: + - `SessionTrial` -> `session_trial` + - `ProcessedEMG` -> `processed_emg` + +### 1.4 Table Name Constraints + +- **Maximum length**: 64 characters (MySQL limit) +- **Final name**: prefix + snake_case(class_name) +- **Validation**: Checked at declaration time + +--- + +## 2. Definition String Grammar + +### 2.1 Overall Structure + +``` +[table_comment] +primary_key_section +--- +secondary_section +``` + +### 2.2 Table Comment (Optional) + +``` +# Free-form description of the table purpose +``` + +- Must be first non-empty line if present +- Starts with `#` +- Cannot start with `#:` +- Stored in MySQL table COMMENT + +### 2.3 Primary Key Separator + +``` +--- +``` + +or equivalently: + +``` +___ +``` + +- Three dashes or three underscores +- Separates primary key attributes (above) from secondary attributes (below) +- Required if table has secondary attributes + +### 2.4 Line Types + +Each non-empty, non-comment line is one of: + +1. **Attribute definition** +2. **Foreign key reference** +3. **Index declaration** + +--- + +## 3. 
Attribute Definition + +### 3.1 Syntax + +``` +attribute_name [= default_value] : type [# comment] +``` + +### 3.2 Components + +| Component | Required | Description | +|-----------|----------|-------------| +| `attribute_name` | Yes | Identifier for the column | +| `default_value` | No | Default value (before colon) | +| `type` | Yes | Data type specification | +| `comment` | No | Documentation (after `#`) | + +### 3.3 Attribute Name Rules + +- **Pattern**: `^[a-z][a-z0-9_]*$` +- **Start**: Lowercase letter +- **Contains**: Lowercase letters, digits, underscores +- **Convention**: snake_case + +### 3.4 Examples + +```python +definition = """ +# Experimental session with subject and timing info +session_id : int32 # auto-assigned +--- +subject_name : varchar(100) # subject identifier +trial_number = 1 : int32 # default to 1 +score = null : float32 # nullable +timestamp = CURRENT_TIMESTAMP : datetime # auto-timestamp +notes = '' : varchar(4000) # empty default +""" +``` + +--- + +## 4. Type System + +### 4.1 Core Types + +Scientist-friendly type names with guaranteed semantics: + +| Type | SQL Mapping | Size | Description | +|------|-------------|------|-------------| +| `int8` | `tinyint` | 1 byte | 8-bit signed integer | +| `uint8` | `tinyint unsigned` | 1 byte | 8-bit unsigned integer | +| `int16` | `smallint` | 2 bytes | 16-bit signed integer | +| `uint16` | `smallint unsigned` | 2 bytes | 16-bit unsigned integer | +| `int32` | `int` | 4 bytes | 32-bit signed integer | +| `uint32` | `int unsigned` | 4 bytes | 32-bit unsigned integer | +| `int64` | `bigint` | 8 bytes | 64-bit signed integer | +| `uint64` | `bigint unsigned` | 8 bytes | 64-bit unsigned integer | +| `float32` | `float` | 4 bytes | 32-bit IEEE 754 float | +| `float64` | `double` | 8 bytes | 64-bit IEEE 754 float | +| `bool` | `tinyint` | 1 byte | Boolean (0 or 1) | +| `uuid` | `binary(16)` | 16 bytes | UUID stored as binary | +| `bytes` | `longblob` | Variable | Binary data (up to 4GB) | + +### 4.2 String Types + +| Type | SQL Mapping | Description | +|------|-------------|-------------| +| `char(N)` | `char(N)` | Fixed-length string | +| `varchar(N)` | `varchar(N)` | Variable-length string (max N) | +| `text` | `text` | Unlimited text | +| `enum('a','b',...)` | `enum(...)` | Enumerated values | + +### 4.3 Temporal Types + +| Type | SQL Mapping | Description | +|------|-------------|-------------| +| `date` | `date` | Date (YYYY-MM-DD) | +| `datetime` | `datetime` | Date and time | +| `datetime(N)` | `datetime(N)` | With fractional seconds (0-6) | + +### 4.4 Other Types + +| Type | SQL Mapping | Description | +|------|-------------|-------------| +| `json` | `json` | JSON document | +| `decimal(P,S)` | `decimal(P,S)` | Fixed-point decimal | + +### 4.5 Native SQL Types (Passthrough) + +These SQL types are accepted but generate a warning recommending core types: + +- Integer variants: `tinyint`, `smallint`, `mediumint`, `bigint`, `integer`, `serial` +- Float variants: `float`, `double`, `real` (with size specifiers) +- Text variants: `tinytext`, `mediumtext`, `longtext` +- Blob variants: `tinyblob`, `smallblob`, `mediumblob`, `longblob` +- Temporal: `time`, `timestamp`, `year` +- Numeric: `numeric(P,S)` + +### 4.6 Codec Types + +Format: `` or `` + +| Codec | Internal dtype | External dtype | Purpose | +|-------|---------------|----------------|---------| +| `` | `bytes` | `` | Serialized Python objects | +| `` | N/A (external only) | `json` | Hash-addressed deduped storage | +| `` | `bytes` | `` | File attachments with 
filename | +| `` | N/A (external only) | `json` | Reference to managed file | +| `` | N/A (external only) | `json` | Object storage (Zarr, HDF5) | + +External storage syntax: +- `` - default store +- `` - named store + +### 4.7 Type Reconstruction + +Core types and codecs are stored in the SQL COMMENT field for reconstruction: + +```sql +COMMENT ':float32:user comment here' +COMMENT '::user comment' +``` + +--- + +## 5. Default Values + +### 5.1 Syntax + +``` +attribute_name = default_value : type +``` + +### 5.2 Literal Types + +| Value | Meaning | SQL | +|-------|---------|-----| +| `null` | Nullable attribute | `DEFAULT NULL` | +| `CURRENT_TIMESTAMP` | Server timestamp | `DEFAULT CURRENT_TIMESTAMP` | +| `"string"` or `'string'` | String literal | `DEFAULT "string"` | +| `123` | Numeric literal | `DEFAULT 123` | +| `true`/`false` | Boolean | `DEFAULT 1`/`DEFAULT 0` | + +### 5.3 Constant Literals + +These values are used without quotes in SQL: +- `NULL` +- `CURRENT_TIMESTAMP` + +### 5.4 Nullable Attributes + +``` +score = null : float32 +``` + +- The special default `null` (case-insensitive) makes the attribute nullable +- Nullable attributes can be omitted from INSERT +- Primary key attributes CANNOT be nullable + +### 5.5 Blob/JSON Default Restrictions + +Blob and JSON attributes can only have `null` as default: + +```python +# Valid +data = null : + +# Invalid - raises DataJointError +data = '' : +``` + +--- + +## 6. Foreign Key References + +### 6.1 Syntax + +``` +-> [options] ReferencedTable +``` + +### 6.2 Options + +| Option | Effect | +|--------|--------| +| `nullable` | All inherited attributes become nullable | +| `unique` | Creates UNIQUE INDEX on FK attributes | + +Options are comma-separated in brackets: +``` +-> [nullable, unique] ParentTable +``` + +### 6.3 Attribute Inheritance + +Foreign keys automatically inherit all primary key attributes from the referenced table: + +```python +# Parent +class Subject(dj.Manual): + definition = """ + subject_id : int32 + --- + name : varchar(100) + """ + +# Child - inherits subject_id +class Session(dj.Manual): + definition = """ + -> Subject + session_id : int32 + --- + session_date : date + """ +``` + +### 6.4 Position Rules + +| Position | Effect | +|----------|--------| +| Before `---` | FK attributes become part of primary key | +| After `---` | FK attributes are secondary (dependent) | + +### 6.5 Nullable Foreign Keys + +``` +-> [nullable] OptionalParent +``` + +- Only allowed after `---` (secondary) +- Primary key FKs cannot be nullable +- Creates optional relationship + +### 6.6 Unique Foreign Keys + +``` +-> [unique] ParentTable +``` + +- Creates UNIQUE INDEX on inherited attributes +- Enforces one-to-one relationship from child perspective + +### 6.7 Projections in Foreign Keys + +``` +-> Parent.proj(alias='original_name') +``` + +- Reference same table multiple times with different attribute names +- Useful for self-referential or multi-reference patterns + +### 6.8 Referential Actions + +All foreign keys use: +- `ON UPDATE CASCADE` - Parent key changes propagate +- `ON DELETE RESTRICT` - Cannot delete parent with children + +### 6.9 Lineage Tracking + +Foreign key relationships are recorded in the `~lineage` table: + +```python +{ + 'child_attr': ('parent_schema.parent_table', 'parent_attr') +} +``` + +Used for semantic attribute matching in queries. + +--- + +## 7. Index Declarations + +### 7.1 Syntax + +``` +index(attr1, attr2, ...) +unique index(attr1, attr2, ...) 
+``` + +### 7.2 Examples + +```python +definition = """ +# User contact information +user_id : int32 +--- +first_name : varchar(50) +last_name : varchar(50) +email : varchar(100) +index(last_name, first_name) +unique index(email) +""" +``` + +### 7.3 Computed Expressions + +Indexes can include SQL expressions: + +``` +index(last_name, (YEAR(birth_date))) +``` + +### 7.4 Limitations + +- Cannot be altered after table creation (via `table.alter()`) +- Must reference existing attributes + +--- + +## 8. Part Tables + +### 8.1 Declaration + +```python +@schema +class Master(dj.Manual): + definition = """ + master_id : int32 + """ + + class Detail(dj.Part): + definition = """ + -> master + detail_id : int32 + --- + value : float32 + """ +``` + +### 8.2 Naming + +- SQL name: `master_table__part_name` +- Example: `experiment__trial` + +### 8.3 Master Reference + +Within Part definition, use: +- `-> master` (lowercase keyword) +- `-> MasterClassName` (class name) + +### 8.4 Constraints + +- Parts must reference their master +- Cannot delete Part records directly (use master) +- Cannot drop Part table directly (use master) +- Part inherits master's primary key + +--- + +## 9. Auto-Populated Tables + +### 9.1 Classes + +- `dj.Imported` - Data from external sources +- `dj.Computed` - Derived from other DataJoint tables + +### 9.2 Primary Key Constraint + +All primary key attributes must come from foreign key references. + +**Valid:** +```python +class Analysis(dj.Computed): + definition = """ + -> Session + -> Parameter + --- + result : float64 + """ +``` + +**Invalid** (by default): +```python +class Analysis(dj.Computed): + definition = """ + -> Session + analysis_id : int32 # ERROR: non-FK primary key + --- + result : float64 + """ +``` + +**Override:** +```python +dj.config['jobs.allow_new_pk_fields_in_computed_tables'] = True +``` + +### 9.3 Job Metadata + +When `config['jobs.add_job_metadata'] = True`, auto-populated tables receive: + +| Column | Type | Description | +|--------|------|-------------| +| `_job_start_time` | `datetime(3)` | Job start timestamp | +| `_job_duration` | `float64` | Duration in seconds | +| `_job_version` | `varchar(64)` | Code version | + +--- + +## 10. Validation + +### 10.1 Parse-Time Checks + +| Check | Error | +|-------|-------| +| Unknown type | `DataJointError: Unsupported attribute type` | +| Invalid attribute name | `DataJointError: Declaration error` | +| Comment starts with `:` | `DataJointError: comment must not start with colon` | +| Non-null blob default | `DataJointError: default value for blob can only be NULL` | + +### 10.2 Declaration-Time Checks + +| Check | Error | +|-------|-------| +| Table name > 64 chars | `DataJointError: Table name exceeds max length` | +| No primary key | `DataJointError: Table must have a primary key` | +| Nullable primary key attr | `DataJointError: Primary key attributes cannot be nullable` | +| Invalid CamelCase | `DataJointError: Invalid table name` | +| FK resolution failure | `DataJointError: Foreign key reference could not be resolved` | + +### 10.3 Insert-Time Validation + +The `table.validate()` method checks: +- Required fields present +- NULL constraints satisfied +- Primary key completeness +- Codec validation (if defined) +- UUID format +- JSON serializability + +--- + +## 11. 
SQL Generation + +### 11.1 CREATE TABLE Template + +```sql +CREATE TABLE `schema`.`table_name` ( + `attr1` TYPE1 NOT NULL COMMENT "...", + `attr2` TYPE2 DEFAULT NULL COMMENT "...", + PRIMARY KEY (`pk1`, `pk2`), + FOREIGN KEY (`fk_attr`) REFERENCES `parent` (`pk`) + ON UPDATE CASCADE ON DELETE RESTRICT, + INDEX (`idx_attr`), + UNIQUE INDEX (`uniq_attr`) +) ENGINE=InnoDB COMMENT="table comment" +``` + +### 11.2 Type Comment Encoding + +Core types and codecs are preserved in comments: + +```sql +`value` float NOT NULL COMMENT ":float32:measurement value" +`data` longblob DEFAULT NULL COMMENT "::serialized data" +`archive` json DEFAULT NULL COMMENT "::external storage" +``` + +--- + +## 12. Implementation Files + +| File | Purpose | +|------|---------| +| `declare.py` | Definition parsing, SQL generation | +| `heading.py` | Attribute metadata, type reconstruction | +| `table.py` | Base Table class, declaration interface | +| `user_tables.py` | Tier classes (Manual, Computed, etc.) | +| `schemas.py` | Schema binding, table decoration | +| `codecs.py` | Codec registry and resolution | +| `lineage.py` | Attribute lineage tracking | + +--- + +## 13. Future Considerations + +Potential improvements identified for the declaration system: + +1. **Better error messages** with suggestions and context +2. **Import-time validation** via `__init_subclass__` +3. **Parser alternatives** (regex-based for simpler grammar) +4. **SQL dialect abstraction** for multi-database support +5. **Extended constraints** (CHECK, custom validation) +6. **Migration support** for schema evolution +7. **Definition caching** for performance +8. **IDE tooling** support via structured intermediate representation From c16e79fdaeb31b5aac2f8d01453950f0640630a6 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 6 Jan 2026 07:45:11 -0600 Subject: [PATCH 10/15] docs: Update README with Relational Workflow Model and OAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Align introduction with documentation home page - Explain Relational Workflow Model core concepts - Add Object-Augmented Schemas (OAS) section - Announce DataJoint 2.0 with migration guide reference πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- README.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c6d60b75b..85c3269e7 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,21 @@ # DataJoint for Python -DataJoint is an open-source Python framework for building scientific data pipelines. -It implements the **Relational Workflow Model**β€”a paradigm that extends relational -databases with native support for computational workflows. +DataJoint is a framework for scientific data pipelines that introduces the **Relational Workflow Model**β€”a paradigm where your database schema is an executable specification of your workflow. -**Key Features:** +Traditional databases store data but don't understand how it was computed. 
DataJoint extends relational databases with native workflow semantics: +- **Tables represent workflow steps** - Each table is a step in your pipeline where entities are created +- **Foreign keys encode dependencies** - Parent tables must be populated before child tables +- **Computations are declarative** - Define *what* to compute; DataJoint determines *when* and tracks *what's done* +- **Results are immutable** - Computed results preserve full provenance and reproducibility + +### Object-Augmented Schemas + +Scientific data includes both structured metadata and large data objects (time series, images, movies, neural recordings, gene sequences). DataJoint solves this with **Object-Augmented Schemas (OAS)**, a unified architecture where relational tables and object storage are managed as one system with identical guarantees for integrity, transactions, and lifecycle. + +### DataJoint 2.0 + +**DataJoint 2.0** solidifies these core concepts with a modernized API, improved type system, and enhanced object storage integration. Existing users can refer to the [Migration Guide](https://docs.datajoint.com/migration/) for upgrading from earlier versions. **Documentation:** https://docs.datajoint.com From aa1db4f70936040cb15c39a9219a1d6215600ece Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 6 Jan 2026 14:00:41 -0600 Subject: [PATCH 11/15] Allow empty insert for tables with all defaults (#1280) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - When all attributes have defaults (autoincrement, nullable, or explicit default), allow inserting empty dicts: table.insert1({}) - Generates SQL: INSERT INTO table () VALUES () - For tables with required fields, raise clear error listing which attributes need values - Add tests for empty insert scenarios Closes #1280 πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/table.py | 21 ++++++++-- src/datajoint/version.py | 2 +- tests/integration/test_insert.py | 69 ++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 5 deletions(-) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index fb1615a11..82b42f781 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -864,10 +864,12 @@ def _insert_rows(self, rows, replace, skip_duplicates, ignore_extra_fields): rows = list(self.__make_row_to_insert(row, field_list, ignore_extra_fields) for row in rows) if rows: try: - query = "{command} INTO {destination}(`{fields}`) VALUES {placeholders}{duplicate}".format( + # Handle empty field_list (all-defaults insert) + fields_clause = f"(`{'`,`'.join(field_list)}`)" if field_list else "()" + query = "{command} INTO {destination}{fields} VALUES {placeholders}{duplicate}".format( command="REPLACE" if replace else "INSERT", destination=self.from_clause(), - fields="`,`".join(field_list), + fields=fields_clause, placeholders=",".join("(" + ",".join(row["placeholders"]) + ")" for row in rows), duplicate=( " ON DUPLICATE KEY UPDATE `{pk}`=`{pk}`".format(pk=self.primary_key[0]) if skip_duplicates else "" ) @@ -1457,8 +1459,19 @@ def check_fields(fields):
if ignore_extra_fields: attributes = [a for a in attributes if a is not None] - assert len(attributes), "Empty tuple" - row_to_insert = dict(zip(("names", "placeholders", "values"), zip(*attributes))) + if not attributes: + # Check if empty insert is allowed (all attributes have defaults) + required_attrs = [ + attr.name + for attr in self.heading.attributes.values() + if not (attr.autoincrement or attr.nullable or attr.default is not None) + ] + if required_attrs: + raise DataJointError(f"Cannot insert empty row. The following attributes require values: {required_attrs}") + # All attributes have defaults - allow empty insert + row_to_insert = {"names": (), "placeholders": (), "values": ()} + else: + row_to_insert = dict(zip(("names", "placeholders", "values"), zip(*attributes))) if not field_list: # first row sets the composition of the field list field_list.extend(row_to_insert["names"]) diff --git a/src/datajoint/version.py b/src/datajoint/version.py index da2a4c956..5e26c773f 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a12" +__version__ = "2.0.0a13" diff --git a/tests/integration/test_insert.py b/tests/integration/test_insert.py index 8352c303b..de22e5565 100644 --- a/tests/integration/test_insert.py +++ b/tests/integration/test_insert.py @@ -438,3 +438,72 @@ def test_validation_result_summary_truncated(self, schema_insert): result = dj.ValidationResult(is_valid=False, errors=errors, rows_checked=20) summary = result.summary() assert "and 10 more errors" in summary + + +class AllDefaultsTable(dj.Manual): + """Table where all attributes have defaults.""" + + definition = """ + id : int auto_increment + --- + timestamp=CURRENT_TIMESTAMP : datetime + notes=null : varchar(200) + """ + + +class TestEmptyInsert: + """Tests for inserting empty dicts (GitHub issue #1280).""" + + @pytest.fixture + def schema_empty_insert(self, connection_test, prefix): + schema = dj.Schema( + prefix + "_empty_insert_test", + context=dict(AllDefaultsTable=AllDefaultsTable, SimpleTable=SimpleTable), + connection=connection_test, + ) + schema(AllDefaultsTable) + schema(SimpleTable) + yield schema + schema.drop() + + def test_empty_insert_all_defaults(self, schema_empty_insert): + """Test that empty insert succeeds when all attributes have defaults.""" + table = AllDefaultsTable() + assert len(table) == 0 + + # Insert empty dict - should use all defaults + table.insert1({}) + assert len(table) == 1 + + # Check that values were populated with defaults + row = table.fetch1() + assert row["id"] == 1 # auto_increment starts at 1 + assert row["timestamp"] is not None # CURRENT_TIMESTAMP + assert row["notes"] is None # nullable defaults to NULL + + def test_empty_insert_multiple(self, schema_empty_insert): + """Test inserting multiple empty dicts.""" + table = AllDefaultsTable() + + # Insert multiple empty dicts + table.insert([{}, {}, {}]) + assert len(table) == 3 + + # Each should have unique auto_increment id + ids = set(table.to_arrays("id")) + assert ids == {1, 2, 3} + + def test_empty_insert_required_fields_error(self, schema_empty_insert): + """Test that empty insert raises clear error when fields are required.""" + table = SimpleTable() + + # SimpleTable has required fields (id, value) + with pytest.raises(dj.DataJointError) as exc_info: + table.insert1({}) + + 
error_msg = str(exc_info.value) + assert "Cannot insert empty row" in error_msg + assert "require values" in error_msg + # Should list the required attributes + assert "id" in error_msg + assert "value" in error_msg From 736860ebde8077abb14c29aeb2a985ddc34609a2 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 6 Jan 2026 16:40:32 -0600 Subject: [PATCH 12/15] Make Codec.get_dtype() an abstractmethod for consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All required codec methods (get_dtype, encode, decode) now use @abstractmethod to catch missing implementations at class definition time rather than runtime. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/codecs.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/datajoint/codecs.py b/src/datajoint/codecs.py index 6eca19155..ab9f4fcf4 100644 --- a/src/datajoint/codecs.py +++ b/src/datajoint/codecs.py @@ -119,6 +119,7 @@ def __init_subclass__(cls, *, register: bool = True, **kwargs): _codec_registry[cls.name] = cls() logger.debug(f"Registered codec <{cls.name}> from {cls.__module__}.{cls.__name__}") + @abstractmethod def get_dtype(self, is_external: bool) -> str: """ Return the storage dtype for this codec. @@ -136,12 +137,10 @@ def get_dtype(self, is_external: bool) -> str: Raises ------ - NotImplementedError - If not overridden by subclass. DataJointError If external storage not supported but requested. """ - raise NotImplementedError(f"Codec <{self.name}> must implement get_dtype()") + ... @abstractmethod def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: From 9207d83fd2c176dc9a0e7b12a0baf3cd0afde744 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 6 Jan 2026 17:48:39 -0600 Subject: [PATCH 13/15] fix: Handle np.bool_ in insert and fix download_path tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add np.bool_ to isinstance checks in table.py for proper boolean handling during insert (line 536 validate, line 1172 placeholder) - Fix test_attach.py and test_update1.py to use dj.config.override() instead of deprecated download_path argument πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/table.py | 4 ++-- tests/integration/test_attach.py | 39 ++++++++++++++++++------------- tests/integration/test_update1.py | 3 ++- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/src/datajoint/table.py b/src/datajoint/table.py index aff79bdd6..8f15971e6 100644 --- a/src/datajoint/table.py +++ b/src/datajoint/table.py @@ -533,7 +533,7 @@ def validate(self, rows, *, ignore_extra_fields=False) -> ValidationResult: continue # Numeric NaN check - if attr.numeric and value != "" and not isinstance(value, bool): + if attr.numeric and value != "" and not isinstance(value, (bool, np.bool_)): try: if np.isnan(float(value)): # NaN is allowed - will be converted to NULL @@ -1169,7 +1169,7 @@ def __make_placeholder(self, name, value, ignore_extra_fields=False, row=None): value = json.dumps(value) # Numeric - convert to string elif attr.numeric: - value = str(int(value) if isinstance(value, bool) else value) + value = str(int(value) if isinstance(value, (bool, np.bool_)) else value) # Blob - pass through as bytes (use for automatic serialization) return name, placeholder, value diff --git a/tests/integration/test_attach.py 
b/tests/integration/test_attach.py index 27916a601..f7ad953fe 100644 --- a/tests/integration/test_attach.py +++ b/tests/integration/test_attach.py @@ -7,6 +7,8 @@ def test_attach_attributes(schema_ext, minio_client, tmpdir_factory): """Test saving files in attachments""" + import datajoint as dj + # create a mock file table = Attach() source_folder = tmpdir_factory.mktemp("source") @@ -23,27 +25,31 @@ def test_attach_attributes(schema_ext, minio_client, tmpdir_factory): download_folder = Path(tmpdir_factory.mktemp("download")) keys = table.keys(order_by="KEY") - path1, path2 = table.to_arrays("img", "txt", download_path=download_folder, order_by="KEY") - # verify that different attachment are renamed if their filenames collide - assert path1[0] != path2[0] - assert path1[0] != path1[1] - assert Path(path1[0]).parent == download_folder - with Path(path1[-1]).open("rb") as f: - check1 = f.read() - with Path(path2[-1]).open("rb") as f: - check2 = f.read() - assert data1 == check1 - assert data2 == check2 + with dj.config.override(download_path=str(download_folder)): + path1, path2 = table.to_arrays("img", "txt", order_by="KEY") + + # verify that different attachment are renamed if their filenames collide + assert path1[0] != path2[0] + assert path1[0] != path1[1] + assert Path(path1[0]).parent == download_folder + with Path(path1[-1]).open("rb") as f: + check1 = f.read() + with Path(path2[-1]).open("rb") as f: + check2 = f.read() + assert data1 == check1 + assert data2 == check2 - # verify that existing files are not duplicated if their filename matches issue #592 - p1, p2 = (Attach & keys[0]).fetch1("img", "txt", download_path=download_folder) - assert p1 == path1[0] - assert p2 == path2[0] + # verify that existing files are not duplicated if their filename matches issue #592 + p1, p2 = (Attach & keys[0]).fetch1("img", "txt") + assert p1 == path1[0] + assert p2 == path2[0] def test_return_string(schema_ext, minio_client, tmpdir_factory): """Test returning string on fetch""" + import datajoint as dj + # create a mock file table = Attach() source_folder = tmpdir_factory.mktemp("source") @@ -59,6 +65,7 @@ def test_return_string(schema_ext, minio_client, tmpdir_factory): table.insert1(dict(attach=2, img=attach1, txt=attach2)) download_folder = Path(tmpdir_factory.mktemp("download")) - path1, path2 = table.to_arrays("img", "txt", download_path=download_folder, order_by="KEY") + with dj.config.override(download_path=str(download_folder)): + path1, path2 = table.to_arrays("img", "txt", order_by="KEY") assert isinstance(path1[0], str) diff --git a/tests/integration/test_update1.py b/tests/integration/test_update1.py index eb525a6be..ef6255bcc 100644 --- a/tests/integration/test_update1.py +++ b/tests/integration/test_update1.py @@ -86,7 +86,8 @@ def test_update1(tmpdir, schema_update1, mock_stores_update): # Insert the relative path within the store Thing.update1(dict(key, img_file=f"{relpath}/{filename}")) - check2 = Thing.fetch1(download_path=tmpdir) + with dj.config.override(download_path=str(tmpdir)): + check2 = Thing.fetch1() buffer2 = Path(check2["picture"]).read_bytes() # read attachment # For filepath, fetch returns ObjectRef - read the file through it filepath_ref = check2["img_file"] From 1cf37b8ef0dee97ed887d5bb841c470e0fdfb204 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Wed, 7 Jan 2026 09:45:42 -0600 Subject: [PATCH 14/15] Fix missing newline at end of version.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit πŸ€– Generated with 
[Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- src/datajoint/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datajoint/version.py b/src/datajoint/version.py index 3d875a71a..31f651ea6 100644 --- a/src/datajoint/version.py +++ b/src/datajoint/version.py @@ -1,4 +1,4 @@ # version bump auto managed by Github Actions: # label_prs.yaml(prep), release.yaml(bump), post_release.yaml(edit) # manually set this version will be eventually overwritten by the above actions -__version__ = "2.0.0a16" \ No newline at end of file +__version__ = "2.0.0a16" From ff7f71e99b73ac556a8cadc61ad1006c1845b60e Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Wed, 7 Jan 2026 10:57:54 -0600 Subject: [PATCH 15/15] license: Clean up Apache 2.0 transition (fixes #1235) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove LICENSE.txt (old LGPL 2.1 file) - Update pyproject.toml to reference LICENSE and use Apache classifier - Update __init__.py docstring with Apache 2.0 notice - Standardize copyright: 2014-2026 DataJoint Inc. and contributors πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- LICENSE | 2 +- LICENSE.txt | 504 -------------------------------------- pyproject.toml | 4 +- src/datajoint/__init__.py | 20 +- 4 files changed, 13 insertions(+), 517 deletions(-) delete mode 100644 LICENSE.txt diff --git a/LICENSE b/LICENSE index 4cdf770f0..3f8b99424 100644 --- a/LICENSE +++ b/LICENSE @@ -175,7 +175,7 @@ END OF TERMS AND CONDITIONS - Copyright 2024 DataJoint Inc. and contributors + Copyright 2014-2026 DataJoint Inc. and contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/LICENSE.txt b/LICENSE.txt deleted file mode 100644 index 90f4edaaa..000000000 --- a/LICENSE.txt +++ /dev/null @@ -1,504 +0,0 @@ - GNU LESSER GENERAL PUBLIC LICENSE - Version 2.1, February 1999 - - Copyright (C) 1991, 1999 Free Software Foundation, Inc. - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - -(This is the first released version of the Lesser GPL. It also counts - as the successor of the GNU Library Public License, version 2, hence - the version number 2.1.) - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -Licenses are intended to guarantee your freedom to share and change -free software--to make sure the software is free for all its users. - - This license, the Lesser General Public License, applies to some -specially designated software packages--typically libraries--of the -Free Software Foundation and other authors who decide to use it. You -can use it too, but we suggest you first think carefully about whether -this license or the ordinary General Public License is the better -strategy to use in any particular case, based on the explanations below. - - When we speak of free software, we are referring to freedom of use, -not price. 
Our General Public Licenses are designed to make sure that -you have the freedom to distribute copies of free software (and charge -for this service if you wish); that you receive source code or can get -it if you want it; that you can change the software and use pieces of -it in new free programs; and that you are informed that you can do -these things. - - To protect your rights, we need to make restrictions that forbid -distributors to deny you these rights or to ask you to surrender these -rights. These restrictions translate to certain responsibilities for -you if you distribute copies of the library or if you modify it. - - For example, if you distribute copies of the library, whether gratis -or for a fee, you must give the recipients all the rights that we gave -you. You must make sure that they, too, receive or can get the source -code. If you link other code with the library, you must provide -complete object files to the recipients, so that they can relink them -with the library after making changes to the library and recompiling -it. And you must show them these terms so they know their rights. - - We protect your rights with a two-step method: (1) we copyright the -library, and (2) we offer you this license, which gives you legal -permission to copy, distribute and/or modify the library. - - To protect each distributor, we want to make it very clear that -there is no warranty for the free library. Also, if the library is -modified by someone else and passed on, the recipients should know -that what they have is not the original version, so that the original -author's reputation will not be affected by problems that might be -introduced by others. - - Finally, software patents pose a constant threat to the existence of -any free program. We wish to make sure that a company cannot -effectively restrict the users of a free program by obtaining a -restrictive license from a patent holder. Therefore, we insist that -any patent license obtained for a version of the library must be -consistent with the full freedom of use specified in this license. - - Most GNU software, including some libraries, is covered by the -ordinary GNU General Public License. This license, the GNU Lesser -General Public License, applies to certain designated libraries, and -is quite different from the ordinary General Public License. We use -this license for certain libraries in order to permit linking those -libraries into non-free programs. - - When a program is linked with a library, whether statically or using -a shared library, the combination of the two is legally speaking a -combined work, a derivative of the original library. The ordinary -General Public License therefore permits such linking only if the -entire combination fits its criteria of freedom. The Lesser General -Public License permits more lax criteria for linking other code with -the library. - - We call this license the "Lesser" General Public License because it -does Less to protect the user's freedom than the ordinary General -Public License. It also provides other free software developers Less -of an advantage over competing non-free programs. These disadvantages -are the reason we use the ordinary General Public License for many -libraries. However, the Lesser license provides advantages in certain -special circumstances. - - For example, on rare occasions, there may be a special need to -encourage the widest possible use of a certain library, so that it becomes -a de-facto standard. 
To achieve this, non-free programs must be -allowed to use the library. A more frequent case is that a free -library does the same job as widely used non-free libraries. In this -case, there is little to gain by limiting the free library to free -software only, so we use the Lesser General Public License. - - In other cases, permission to use a particular library in non-free -programs enables a greater number of people to use a large body of -free software. For example, permission to use the GNU C Library in -non-free programs enables many more people to use the whole GNU -operating system, as well as its variant, the GNU/Linux operating -system. - - Although the Lesser General Public License is Less protective of the -users' freedom, it does ensure that the user of a program that is -linked with the Library has the freedom and the wherewithal to run -that program using a modified version of the Library. - - The precise terms and conditions for copying, distribution and -modification follow. Pay close attention to the difference between a -"work based on the library" and a "work that uses the library". The -former contains code derived from the library, whereas the latter must -be combined with the library in order to run. - - GNU LESSER GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License Agreement applies to any software library or other -program which contains a notice placed by the copyright holder or -other authorized party saying it may be distributed under the terms of -this Lesser General Public License (also called "this License"). -Each licensee is addressed as "you". - - A "library" means a collection of software functions and/or data -prepared so as to be conveniently linked with application programs -(which use some of those functions and data) to form executables. - - The "Library", below, refers to any such software library or work -which has been distributed under these terms. A "work based on the -Library" means either the Library or any derivative work under -copyright law: that is to say, a work containing the Library or a -portion of it, either verbatim or with modifications and/or translated -straightforwardly into another language. (Hereinafter, translation is -included without limitation in the term "modification".) - - "Source code" for a work means the preferred form of the work for -making modifications to it. For a library, complete source code means -all the source code for all modules it contains, plus any associated -interface definition files, plus the scripts used to control compilation -and installation of the library. - - Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. The act of -running a program using the Library is not restricted, and output from -such a program is covered only if its contents constitute a work based -on the Library (independent of the use of the Library in a tool for -writing it). Whether that is true depends on what the Library does -and what the program that uses the Library does. - - 1. You may copy and distribute verbatim copies of the Library's -complete source code as you receive it, in any medium, provided that -you conspicuously and appropriately publish on each copy an -appropriate copyright notice and disclaimer of warranty; keep intact -all the notices that refer to this License and to the absence of any -warranty; and distribute a copy of this License along with the -Library. 
- - You may charge a fee for the physical act of transferring a copy, -and you may at your option offer warranty protection in exchange for a -fee. - - 2. You may modify your copy or copies of the Library or any portion -of it, thus forming a work based on the Library, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) The modified work must itself be a software library. - - b) You must cause the files modified to carry prominent notices - stating that you changed the files and the date of any change. - - c) You must cause the whole of the work to be licensed at no - charge to all third parties under the terms of this License. - - d) If a facility in the modified Library refers to a function or a - table of data to be supplied by an application program that uses - the facility, other than as an argument passed when the facility - is invoked, then you must make a good faith effort to ensure that, - in the event an application does not supply such function or - table, the facility still operates, and performs whatever part of - its purpose remains meaningful. - - (For example, a function in a library to compute square roots has - a purpose that is entirely well-defined independent of the - application. Therefore, Subsection 2d requires that any - application-supplied function or table used by this function must - be optional: if the application does not supply it, the square - root function must still compute square roots.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Library, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Library, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote -it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Library. - -In addition, mere aggregation of another work not based on the Library -with the Library (or with a work based on the Library) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. You may opt to apply the terms of the ordinary GNU General Public -License instead of this License to a given copy of the Library. To do -this, you must alter all the notices that refer to this License, so -that they refer to the ordinary GNU General Public License, version 2, -instead of to this License. (If a newer version than version 2 of the -ordinary GNU General Public License has appeared, then you can specify -that version instead if you wish.) Do not make any other change in -these notices. - - Once this change is made in a given copy, it is irreversible for -that copy, so the ordinary GNU General Public License applies to all -subsequent copies and derivative works made from that copy. - - This option is useful when you wish to copy part of the code of -the Library into a program that is not a library. - - 4. 
You may copy and distribute the Library (or a portion or -derivative of it, under Section 2) in object code or executable form -under the terms of Sections 1 and 2 above provided that you accompany -it with the complete corresponding machine-readable source code, which -must be distributed under the terms of Sections 1 and 2 above on a -medium customarily used for software interchange. - - If distribution of object code is made by offering access to copy -from a designated place, then offering equivalent access to copy the -source code from the same place satisfies the requirement to -distribute the source code, even though third parties are not -compelled to copy the source along with the object code. - - 5. A program that contains no derivative of any portion of the -Library, but is designed to work with the Library by being compiled or -linked with it, is called a "work that uses the Library". Such a -work, in isolation, is not a derivative work of the Library, and -therefore falls outside the scope of this License. - - However, linking a "work that uses the Library" with the Library -creates an executable that is a derivative of the Library (because it -contains portions of the Library), rather than a "work that uses the -library". The executable is therefore covered by this License. -Section 6 states terms for distribution of such executables. - - When a "work that uses the Library" uses material from a header file -that is part of the Library, the object code for the work may be a -derivative work of the Library even though the source code is not. -Whether this is true is especially significant if the work can be -linked without the Library, or if the work is itself a library. The -threshold for this to be true is not precisely defined by law. - - If such an object file uses only numerical parameters, data -structure layouts and accessors, and small macros and small inline -functions (ten lines or less in length), then the use of the object -file is unrestricted, regardless of whether it is legally a derivative -work. (Executables containing this object code plus portions of the -Library will still fall under Section 6.) - - Otherwise, if the work is a derivative of the Library, you may -distribute the object code for the work under the terms of Section 6. -Any executables containing that work also fall under Section 6, -whether or not they are linked directly with the Library itself. - - 6. As an exception to the Sections above, you may also combine or -link a "work that uses the Library" with the Library to produce a -work containing portions of the Library, and distribute that work -under terms of your choice, provided that the terms permit -modification of the work for the customer's own use and reverse -engineering for debugging such modifications. - - You must give prominent notice with each copy of the work that the -Library is used in it and that the Library and its use are covered by -this License. You must supply a copy of this License. If the work -during execution displays copyright notices, you must include the -copyright notice for the Library among them, as well as a reference -directing the user to the copy of this License. 
Also, you must do one -of these things: - - a) Accompany the work with the complete corresponding - machine-readable source code for the Library including whatever - changes were used in the work (which must be distributed under - Sections 1 and 2 above); and, if the work is an executable linked - with the Library, with the complete machine-readable "work that - uses the Library", as object code and/or source code, so that the - user can modify the Library and then relink to produce a modified - executable containing the modified Library. (It is understood - that the user who changes the contents of definitions files in the - Library will not necessarily be able to recompile the application - to use the modified definitions.) - - b) Use a suitable shared library mechanism for linking with the - Library. A suitable mechanism is one that (1) uses at run time a - copy of the library already present on the user's computer system, - rather than copying library functions into the executable, and (2) - will operate properly with a modified version of the library, if - the user installs one, as long as the modified version is - interface-compatible with the version that the work was made with. - - c) Accompany the work with a written offer, valid for at - least three years, to give the same user the materials - specified in Subsection 6a, above, for a charge no more - than the cost of performing this distribution. - - d) If distribution of the work is made by offering access to copy - from a designated place, offer equivalent access to copy the above - specified materials from the same place. - - e) Verify that the user has already received a copy of these - materials or that you have already sent this user a copy. - - For an executable, the required form of the "work that uses the -Library" must include any data and utility programs needed for -reproducing the executable from it. However, as a special exception, -the materials to be distributed need not include anything that is -normally distributed (in either source or binary form) with the major -components (compiler, kernel, and so on) of the operating system on -which the executable runs, unless that component itself accompanies -the executable. - - It may happen that this requirement contradicts the license -restrictions of other proprietary libraries that do not normally -accompany the operating system. Such a contradiction means you cannot -use both them and the Library together in an executable that you -distribute. - - 7. You may place library facilities that are a work based on the -Library side-by-side in a single library together with other library -facilities not covered by this License, and distribute such a combined -library, provided that the separate distribution of the work based on -the Library and of the other library facilities is otherwise -permitted, and provided that you do these two things: - - a) Accompany the combined library with a copy of the same work - based on the Library, uncombined with any other library - facilities. This must be distributed under the terms of the - Sections above. - - b) Give prominent notice with the combined library of the fact - that part of it is a work based on the Library, and explaining - where to find the accompanying uncombined form of the same work. - - 8. You may not copy, modify, sublicense, link with, or distribute -the Library except as expressly provided under this License. 
Any -attempt otherwise to copy, modify, sublicense, link with, or -distribute the Library is void, and will automatically terminate your -rights under this License. However, parties who have received copies, -or rights, from you under this License will not have their licenses -terminated so long as such parties remain in full compliance. - - 9. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Library or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Library (or any work based on the -Library), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Library or works based on it. - - 10. Each time you redistribute the Library (or any work based on the -Library), the recipient automatically receives a license from the -original licensor to copy, distribute, link with or modify the Library -subject to these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties with -this License. - - 11. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Library at all. For example, if a patent -license would not permit royalty-free redistribution of the Library by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Library. - -If any portion of this section is held invalid or unenforceable under any -particular circumstance, the balance of the section is intended to apply, -and the section as a whole is intended to apply in other circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 12. If the distribution and/or use of the Library is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Library under this License may add -an explicit geographical distribution limitation excluding those countries, -so that distribution is permitted only in or among countries not thus -excluded. 
In such case, this License incorporates the limitation as if -written in the body of this License. - - 13. The Free Software Foundation may publish revised and/or new -versions of the Lesser General Public License from time to time. -Such new versions will be similar in spirit to the present version, -but may differ in detail to address new problems or concerns. - -Each version is given a distinguishing version number. If the Library -specifies a version number of this License which applies to it and -"any later version", you have the option of following the terms and -conditions either of that version or of any later version published by -the Free Software Foundation. If the Library does not specify a -license version number, you may choose any version ever published by -the Free Software Foundation. - - 14. If you wish to incorporate parts of the Library into other free -programs whose distribution conditions are incompatible with these, -write to the author to ask for permission. For software which is -copyrighted by the Free Software Foundation, write to the Free -Software Foundation; we sometimes make exceptions for this. Our -decision will be guided by the two goals of preserving the free status -of all derivatives of our free software and of promoting the sharing -and reuse of software generally. - - NO WARRANTY - - 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO -WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. -EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR -OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY -KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE -LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME -THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN -WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY -AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU -FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR -CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE -LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING -RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A -FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF -SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Libraries - - If you develop a new library, and you want it to be of the greatest -possible use to the public, we recommend making it free software that -everyone can redistribute and change. You can do so by permitting -redistribution under these terms (or, alternatively, under the terms of the -ordinary General Public License). - - To apply these terms, attach the following notices to the library. It is -safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least the -"copyright" line and a pointer to where the full notice is found. 
-
-    {description}
-    Copyright (C) {year} {fullname}
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
-    USA
-
-Also add information on how to contact you by electronic and paper mail.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the library, if
-necessary. Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the
-  library `Frob' (a library for tweaking knobs) written by James Random
-  Hacker.
-
-  {signature of Ty Coon}, 1 April 1990
-  Ty Coon, President of Vice
-
-That's all there is to it!
diff --git a/pyproject.toml b/pyproject.toml
index b719cdb73..f3eee2313 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,7 @@ maintainers = [
 # manually sync here: https://docs.datajoint.com/core/datajoint-python/latest/#welcome-to-datajoint-for-python
 description = "DataJoint for Python is a framework for scientific workflow management based on relational principles. DataJoint is built on the foundation of the relational data model and prescribes a consistent method for organizing, populating, computing, and querying data."
 readme = "README.md"
-license = {file = "LICENSE.txt"}
+license = {file = "LICENSE"}
 keywords = [
     "database",
     "automated",
@@ -62,7 +62,7 @@ classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Intended Audience :: Science/Research",
     "Intended Audience :: Healthcare Industry",
-    "License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)",
+    "License :: OSI Approved :: Apache Software License",
     "Topic :: Software Development :: Libraries :: Python Modules",
     "Topic :: Scientific/Engineering",
     "Topic :: Scientific/Engineering :: Bio-Informatics",
diff --git a/src/datajoint/__init__.py b/src/datajoint/__init__.py
index 14c6ebd4e..7c72b71db 100644
--- a/src/datajoint/__init__.py
+++ b/src/datajoint/__init__.py
@@ -1,18 +1,18 @@
 """
-DataJoint for Python is a framework for building data pipelines using MySQL databases
-to represent pipeline structure and bulk storage systems for large objects.
-DataJoint is built on the foundation of the relational data model and prescribes a
-consistent method for organizing, populating, and querying data.
+DataJoint for Python — a framework for scientific data pipelines.
 
-The DataJoint data model is described in https://arxiv.org/abs/1807.11104
+DataJoint introduces the Relational Workflow Model, where your database schema
+is an executable specification of your workflow. Tables represent workflow steps,
+foreign keys encode dependencies, and computations are declarative.
 
-DataJoint is free software under the LGPL License. In addition, we request
-that any use of DataJoint leading to a publication be acknowledged in the publication.
+Documentation: https://docs.datajoint.com +Source: https://github.com/datajoint/datajoint-python -Please cite: +Copyright 2014-2026 DataJoint Inc. and contributors. +Licensed under the Apache License, Version 2.0. - - http://biorxiv.org/content/early/2015/11/14/031658 - - http://dx.doi.org/10.1101/031658 +If DataJoint contributes to a publication, please cite: +https://doi.org/10.1101/031658 """ __author__ = "DataJoint Contributors"
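
The new module docstring states that tables represent workflow steps, foreign keys encode dependencies, and computations are declarative. The following minimal sketch is not part of this patch; it only illustrates that model, and the schema name "tutorial" and all table attributes are hypothetical.

import datajoint as dj

schema = dj.Schema("tutorial")  # hypothetical schema name


@schema
class Session(dj.Manual):
    # one workflow step: sessions entered manually
    definition = """
    session_id : int
    ---
    session_note : varchar(255)
    """


@schema
class SessionStats(dj.Computed):
    # a downstream step; the foreign key encodes the dependency on Session
    definition = """
    -> Session
    ---
    note_length : int
    """

    def make(self, key):
        # compute and insert the result for one upstream key
        note = (Session & key).fetch1("session_note")
        self.insert1(dict(key, note_length=len(note)))


# Find Session keys with no SessionStats entry and call make() for each.
SessionStats.populate()

Here populate() drives the declarative computation: it identifies missing keys in the key source and invokes make() once per key, which is the mechanism documented in the AutoPopulate docstrings elsewhere in this patch series.
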