# Source code for obsplus.utils.dataset
"""
Simple utility for DataSet.
"""
import shutil
import tempfile
import textwrap
from contextlib import suppress
from pathlib import Path
from typing import Optional, Union
import obsplus
def _create_opsdata(opsdata_path: Path):
    """
    Set up the directory used to store obsplus datasets.

    If ``opsdata_path`` already exists nothing is touched. Otherwise the
    directory (and any missing parents) is created and a ``README.txt``
    describing its layout is written inside it.
    """
    root = Path(opsdata_path)
    # only a missing directory needs any work; an existing one is left as-is
    if not root.exists():
        root.mkdir(parents=True, exist_ok=True)
        readme_text = textwrap.dedent(
            """
            This directory contains the data sets curated by the obsplus python
            package (github.com/niosh-mining/obsplus).
            Each sub-directory contains a single data set and the data set's name is
            the name of the directory. You can load the dataset using the
            obsplus.load_dataset function and passing the name of the data set as a
            string.
            """
        )
        (root / "README.txt").write_text(readme_text)
# [docs]
def copy_dataset(
    dataset: Union[str, "obsplus.DataSet"],
    destination: Optional[Union[str, Path]] = None,
) -> "obsplus.DataSet":
    """
    Copy a dataset to a destination.

    If the dataset's directory already exists inside the destination,
    nothing is copied.

    Parameters
    ----------
    dataset
        The name of the dataset or a DataSet object.
    destination
        The directory to copy the dataset into. It will be created if it
        doesn't exist. If None is provided, tempfile.mkdtemp is used to
        create a temporary directory.

    Returns
    -------
    A new dataset object which refers to the copied files.

    Raises
    ------
    AssertionError
        If the data path of the loaded dataset does not exist.
    """
    dataset = obsplus.load_dataset(dataset)
    expected_path: Path = dataset.data_path
    # Raise explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not expected_path.exists():
        raise AssertionError(f"{expected_path} not yet downloaded")
    # make destination paths and copy
    if destination is None:  # use a temp directory if none specified
        dest_dir = Path(tempfile.mkdtemp())
    else:
        dest_dir = Path(destination)
        dest_dir.mkdir(parents=True, exist_ok=True)
    dest = dest_dir / dataset.name
    # copytree raises FileExistsError when dest is already there; treat
    # that as "already copied" and leave the existing files alone.
    with suppress(FileExistsError):
        shutil.copytree(str(expected_path), str(dest))
    # init new dataset of same class with updated base_path and return
    return dataset.__class__(base_path=dest.parent)