diff --git a/.gitignore b/.gitignore index 8bd57da..fa75286 100644 --- a/.gitignore +++ b/.gitignore @@ -101,5 +101,6 @@ ENV/ # mypy .mypy_cache/ -# Example archive files +# Ignore application and data files and folders data/nyc-open-data/y43c-5n92/archive/ +.histore/ diff --git a/changelog.md b/changelog.md index 8ff5be3..6fd1a17 100644 --- a/changelog.md +++ b/changelog.md @@ -10,3 +10,11 @@ * Allow different types of input documents (e.g., CSV files or Json) * External merge-sort for large CSV files. * Add managers for maintaining sets of archives + + +### 0.1.2 - 06-25-2020 + +* Proper handling of date/time objects by the default archive reader and writer +* Optional arguments for Json encoder and decoder for persistent archives +* Add encoder and decoder information to archive manager metadata +* Simple command-line interface for persistent archive manager diff --git a/histore/archive/base.py b/histore/archive/base.py index f8594e1..026d861 100644 --- a/histore/archive/base.py +++ b/histore/archive/base.py @@ -305,7 +305,7 @@ class PersistentArchive(Archive): """ def __init__( self, basedir, replace=False, serializer=None, compression=None, - primary_key=None + primary_key=None, encoder=None, decoder=None ): """Initialize the associated archive store and the optional primary key columns that are used to generate row identifier. If no primary @@ -328,13 +328,19 @@ def __init__( compressed. the metadata file is always storesd as plain text. primary_key: string or list Column(s) that are used to generate identifier for snapshot rows. + encoder: json.JSONEncoder, default=None + Encoder used when writing archive rows as JSON objects to file. + decoder: func, default=None + Custom decoder function when reading archive rows from file. """ super(PersistentArchive, self).__init__( store=ArchiveFileStore( basedir=basedir, replace=replace, serializer=serializer, - compression=compression + compression=compression, + encoder=encoder, + decoder=decoder ), primary_key=primary_key ) diff --git a/histore/archive/manager/base.py b/histore/archive/manager/base.py index dcc1491..390ae29 100644 --- a/histore/archive/manager/base.py +++ b/histore/archive/manager/base.py @@ -23,7 +23,7 @@ def archives(self): Returns ------- - dict + dict(string: histore.archive.manager.descriptor.ArchiveDescriptor) """ raise NotImplementedError() @@ -42,7 +42,10 @@ def contains(self, identifier): return identifier in self.archives() @abstractmethod - def create(self, name=None, description=None, primary_key=None): + def create( + self, name=None, description=None, primary_key=None, encoder=None, + decoder=None + ): """Create a new archive object. Parameters @@ -54,6 +57,12 @@ def create(self, name=None, description=None, primary_key=None): primary_key: string or list, default=None Column(s) that are used to generate identifier for rows in the archive. + encoder: string, default=None + Full package path for the Json encoder class that is used by the + persistent archive. + decoder: string, default=None + Full package path for the Json decoder function that is used by the + persistent archive. Returns ------- @@ -84,7 +93,7 @@ def get(self, identifier): Returns ------- - histore.archive.vase.Archive + histore.archive.base.Archive Raises ------ @@ -92,6 +101,24 @@ def get(self, identifier): """ raise NotImplementedError() + def get_by_name(self, name): + """Get descriptor for the archive with the given name. If no archive + with that name exists None is returned. + + Parameters + ---------- + name: string + Archive name + + Returns + ------- + histore.archive.manager.descriptor.ArchiveDescriptor + """ + for archive in self.archives().values(): + if archive.name() == name: + return archive + return None + def list(self): """Get the list of descriptors for the maintained archives. @@ -100,3 +127,21 @@ def list(self): list(histore.archive.manager.descriptor.ArchiveDescriptor) """ return list(self.archives().values()) + + @abstractmethod + def rename(self, identifier, name): + """Rename the specified archive. Raises a ValueError if the identifier + is unknown or if an archive with the given name exist. + + Parameters + ---------- + identifier: string + Unique archive identifier + name: string + New archive name. + + Raises + ------ + ValueError + """ + raise NotImplementedError() diff --git a/histore/archive/manager/descriptor.py b/histore/archive/manager/descriptor.py index 66b7bb9..7ed404b 100644 --- a/histore/archive/manager/descriptor.py +++ b/histore/archive/manager/descriptor.py @@ -20,14 +20,17 @@ 'type': 'object', 'properties': { 'id': {'type': 'string'}, + 'createdAt': {'type': 'string'}, 'name': {'type': 'string'}, 'description': {'type': 'string'}, 'primaryKey': { 'type': 'array', 'items': {'type': 'string'} - } + }, + 'encoder': {'type': 'string'}, + 'decoder': {'type': 'string'} }, - 'required': ['id'] + 'required': ['id', 'createdAt'] } @@ -58,7 +61,10 @@ def __init__(self, doc, validate=True): jsonschema.validate(instance=doc, schema=DESCRIPTOR_SCHEMA) @staticmethod - def create(name=None, description=None, primary_key=None): + def create( + name=None, description=None, primary_key=None, encoder=None, + decoder=None + ): """Create a new archive descriptor object. Parameters @@ -70,6 +76,11 @@ def create(name=None, description=None, primary_key=None): primary_key: string or list, default=None Column(s) that are used to generate identifier for rows in the archive. + encoder: string, default=None + Full package path for the Json encoder class that is used by the + persistent archive. + decoder: string, default=None + Full package path for the Json decoder function that is used by the Returns ------- @@ -81,15 +92,37 @@ def create(name=None, description=None, primary_key=None): # Create a unique identifier for the new archive. identifier = util.get_unique_identifier() # Create the archive descriptor. - doc = {'id': identifier} + doc = {'id': identifier, 'createdAt': util.utc_now().isoformat()} if name is not None: doc['name'] = name if description is not None: doc['description'] = description if primary_key is not None: doc['primaryKey'] = primary_key + if encoder is not None: + doc['encoder'] = encoder + if decoder is not None: + doc['decoder'] = decoder return ArchiveDescriptor(doc) + def created_at(self): + """Get creating timestamp for the archive. + + Returns + ------- + datetime.datetime + """ + return util.to_datetime(self.doc.get('createdAt')) + + def decoder(self): + """Get package path for Json decoder used by persistent archives. + + Returns + ------- + string + """ + return self.doc.get('decoder') + def description(self): """Get archive description. If the value is not set in the descriptor an empty string is returned as default. @@ -100,6 +133,15 @@ def description(self): """ return self.doc.get('description', '') + def encoder(self): + """Get package path for Json encoder used by persistent archives. + + Returns + ------- + string + """ + return self.doc.get('encoder') + def identifier(self): """Get the unique archive identifier value. @@ -127,3 +169,13 @@ def primary_key(self): list(string) """ return self.doc.get('primaryKey') + + def rename(self, name): + """Update the name of the archive. + + Parameters + ---------- + name: string + New archive name. + """ + self.doc['name'] = name diff --git a/histore/archive/manager/fs.py b/histore/archive/manager/fs.py index b48fb65..b400b56 100644 --- a/histore/archive/manager/fs.py +++ b/histore/archive/manager/fs.py @@ -23,8 +23,8 @@ class PersistentArchiveManager(ArchiveManager): """The persistent archive manager maintains a set of archives on disk. The list of archive descriptors is also maintained on disk as a Json file. """ - def __init__(self, basedir=None): - """Initialize the base directory under which all archives are stores in + def __init__(self, basedir=None, exists=False): + """Initialize the base directory under which all archives are stored in individual sub-folders. If the base directory is not given the value will be read from the environment variable HISTORE_BASEDIR or the default value $HOME/.histore. @@ -35,15 +35,28 @@ def __init__(self, basedir=None): ---------- basedir: string Path to dorectory on disk where archives are maintained. + exists: bool, default=False + Raise an error if the given base directory is not empty or contains + the descriptor file. """ if basedir is None: basedir = config.BASEDIR() - self.basedir = util.createdir(basedir) # Initialize path to file that maintains archive descriptors. - self.descriptorfile = os.path.join(self.basedir, 'archives.json') + self.descriptorfile = os.path.join(basedir, 'archives.json') + exists_descriptorfile = os.path.isfile(self.descriptorfile) + # Raise error if the archive manager base directory is expected to + # exist. + if exists: + # Raise error if (i) the directory does not exists, or (ii) exists + # but does not contain the descriptor file. + if not os.path.isdir(basedir): + raise ValueError('archive manager does not exist') + elif os.listdir(basedir) and not exists_descriptorfile: + raise ValueError('archive manager does not exist') + self.basedir = util.createdir(basedir) # Initialize the internal cache of archive descriptors self._archives = dict() - if os.path.isfile(self.descriptorfile): + if exists_descriptorfile: with open(self.descriptorfile, 'r') as f: doc = json.load(f) for obj in doc: @@ -56,12 +69,16 @@ def archives(self): Returns ------- - dict + dict(string: histore.archive.manager.descriptor.ArchiveDescriptor) """ return self._archives - def create(self, name=None, description=None, primary_key=None): - """Create a new archive object. + def create( + self, name=None, description=None, primary_key=None, encoder=None, + decoder=None + ): + """Create a new archive object. Raises a ValueError if an archive with + the given name exists. Parameters ---------- @@ -72,23 +89,36 @@ def create(self, name=None, description=None, primary_key=None): primary_key: string or list, default=None Column(s) that are used to generate identifier for rows in the archive. + encoder: string, default=None + Full package path for the Json encoder class that is used by the + persistent archive. + decoder: string, default=None + Full package path for the Json decoder function that is used by the Returns ------- histore.archive.manager.descriptor.ArchiveDescriptor + + Raises + ------ + ValueError """ + # Ensure that the archive name is unique. + if self.get_by_name(name) is not None: + raise ValueError("archive '{}' already exists".format(name)) # Create the descriptor for the new archive. descriptor = ArchiveDescriptor.create( name=name, description=description, - primary_key=primary_key + primary_key=primary_key, + encoder=encoder, + decoder=decoder ) identifier = descriptor.identifier() primary_key = descriptor.primary_key() # Write list of archive descriptors. self._archives[identifier] = descriptor - with open(self.descriptorfile, 'w') as f: - json.dump([d.doc for d in self._archives.values()], f) + self.write() return descriptor def delete(self, identifier): @@ -100,10 +130,11 @@ def delete(self, identifier): Unique archive identifier """ if self.contains(identifier): - shutil.rmtree(os.path.join(self.basedir, identifier)) + archdir = os.path.join(self.basedir, identifier) + if os.path.isdir(archdir): + shutil.rmtree(archdir) del self._archives[identifier] - with open(self.descriptorfile, 'w') as f: - json.dump([d.doc for d in self._archives.values()], f) + self.write() def get(self, identifier): """Get the archive that is associated with the given identifier. Raises @@ -116,14 +147,65 @@ def get(self, identifier): Returns ------- - histore.archive.vase.Archive + histore.archive.base.Archive Raises ------ ValueError """ - if identifier not in self._archives: + desc = self._archives.get(identifier) + if desc is None: raise ValueError('unknown archive {}'.format(identifier)) archdir = os.path.join(self.basedir, identifier) - primary_key = self._archives[identifier].primary_key() - return PersistentArchive(basedir=archdir, primary_key=primary_key) + primary_key = desc.primary_key() + # Load JSONEncoder class if encoder is contained in the descriptor. + if desc.encoder() is not None: + encoder = util.import_obj(desc.encoder()) + else: + encoder = None + # Load the corresponding Json decoder function if a decoder is + # contained in the descriptor. + if desc.decoder() is not None: + decoder = util.import_obj(desc.decoder()) + else: + decoder = None + return PersistentArchive( + basedir=archdir, + primary_key=primary_key, + encoder=encoder, + decoder=decoder + ) + + def rename(self, identifier, name): + """Rename the specified archive. Raises a ValueError if the identifier + is unknown or if an archive with the given name exist. + + Parameters + ---------- + identifier: string + Unique archive identifier + name: string + New archive name. + + Raises + ------ + ValueError + """ + archive = self._archives.get(identifier) + if archive is None: + raise ValueError("unknown archive '{}''".format(identifier)) + if archive.name() == name: + # Do nothing if the archive name matches the new name. + return + # Raise an error if another archive with the new name exists. + # Ensure that the archive name is unique. + if self.get_by_name(name) is not None: + raise ValueError("archive '{}' already exists".format(name)) + archive.rename(name) + # Write list of archive descriptors. + self.write() + + def write(self): + """Write the current descriptor set to file.""" + with open(self.descriptorfile, 'w') as f: + json.dump([d.doc for d in self._archives.values()], f) diff --git a/histore/archive/manager/mem.py b/histore/archive/manager/mem.py index de979db..84b788f 100644 --- a/histore/archive/manager/mem.py +++ b/histore/archive/manager/mem.py @@ -28,12 +28,16 @@ def archives(self): Returns ------- - dict + dict(string: histore.archive.manager.descriptor.ArchiveDescriptor) """ return self._descriptors - def create(self, name=None, description=None, primary_key=None): - """Create a new volatiole archive object. + def create( + self, name=None, description=None, primary_key=None, encoder=None, + decoder=None + ): + """Create a new volatile archive object. Raises a ValueError if an + archive with the given name exists. Parameters ---------- @@ -44,11 +48,24 @@ def create(self, name=None, description=None, primary_key=None): primary_key: string or list, default=None Column(s) that are used to generate identifier for rows in the archive. + encoder: string, default=None + Ignored. Full package path for the Json encoder class that is used + by the persistent archive. Included for API completeness. + decoder: string, default=None + Ignored. Full package path for the Json decoder function that is + used by the persistent archive. Included for API completeness. Returns ------- histore.archive.manager.descriptor.ArchiveDescriptor + + Raises + ------ + ValueError """ + # Ensure that the archive name is unique. + if self.get_by_name(name) is not None: + raise ValueError("archive '{}' already exists".format(name)) # Create the descriptor for the new archive. descriptor = ArchiveDescriptor.create( name=name, @@ -84,12 +101,39 @@ def get(self, identifier): Returns ------- - histore.archive.vase.Archive + histore.archive.base.Archive Raises ------ ValueError """ if identifier not in self._archives: - raise ValueError('unknown archive {}'.format(identifier)) + raise ValueError("unknown archive '{}''".format(identifier)) return self._archives[identifier] + + def rename(self, identifier, name): + """Rename the specified archive. Raises a ValueError if the identifier + is unknown or if an archive with the given name exist. + + Parameters + ---------- + identifier: string + Unique archive identifier + name: string + New archive name. + + Raises + ------ + ValueError + """ + archive = self._descriptors.get(identifier) + if archive is None: + raise ValueError("unknown archive '{}''".format(identifier)) + if archive.name() == name: + # Do nothing if the archive name matches the new name. + return + # Raise an error if another archive with the new name exists. + # Ensure that the archive name is unique. + if self.get_by_name(name) is not None: + raise ValueError("archive '{}' already exists".format(name)) + archive.rename(name) diff --git a/histore/archive/row.py b/histore/archive/row.py index 4e22a99..13dd1d6 100644 --- a/histore/archive/row.py +++ b/histore/archive/row.py @@ -102,7 +102,7 @@ def at_version(self, version, columns, raise_error=True): value = self.cells.get(colid) if value is None: if raise_error: - raise ValueError('unknown version %d' % version) + raise ValueError('unknown column {}'.format(colid)) else: value = value.at_version(version, raise_error=raise_error) values.append(value) diff --git a/histore/archive/serialize/default.py b/histore/archive/serialize/default.py index e32e269..e275e1b 100644 --- a/histore/archive/serialize/default.py +++ b/histore/archive/serialize/default.py @@ -23,6 +23,11 @@ class DefaultSerializer(ArchiveSerializer): """Implementation of the archive object serializer. This is the default serializer used by HISTORE. + + The default serializer is configurable with respect to the labels that are + used in the serialization. Given labels should not start with the reserved + character '$' which is being used to encode the type of special values that + cannot be serialized by the default JSON encoder. """ def __init__( self, timestamp='t', pos='p', name='n', cells='c', value='v', @@ -32,6 +37,9 @@ def __init__( """Initialize the labels for elements used in the serialized objects. By default short labels are used to reduce storage overhead. + Passing a label that starts with the reserved character '$' will result + in a ValueError. + Parameters ---------- timestamp: string, default='t' @@ -58,19 +66,23 @@ def __init__( Element label for snapshot transaction time. description: string, default='d' Element label for snapshot descriptions. + + Raises + ------ + ValueError """ - self.timestamp = timestamp - self.pos = pos - self.name = name - self.cells = cells - self.value = value - self.key = key - self.rowid = rowid - self.colid = colid - self.version = version - self.valid_time = valid_time - self.transaction_time = transaction_time - self.description = description + self.timestamp = validate_label(timestamp) + self.pos = validate_label(pos) + self.name = validate_label(name) + self.cells = validate_label(cells) + self.value = validate_label(value) + self.key = validate_label(key) + self.rowid = validate_label(rowid) + self.colid = validate_label(colid) + self.version = validate_label(version) + self.valid_time = validate_label(valid_time) + self.transaction_time = validate_label(transaction_time) + self.description = validate_label(description) def deserialize_column(self, obj): """Get archive schema column instance from a serialized object. @@ -312,3 +324,30 @@ def serialize_value(self, value, ts): self.value: v.value } for v in value.values] return obj + + +# -- Helper Functions --------------------------------------------------------- + +def validate_label(label): + """Ensure that the given label does not start with the reserved '$' + character. + + Returns the given label if valid. Raises a ValueError if an invalid label + is given. + + Parameters + ---------- + label: string + Label for serialization of an archive component. + + Returns + ------- + string + + Raises + ------ + ValueError + """ + if label.startswith('$'): + raise ValueError("invalid label '{}'".format(label)) + return label diff --git a/histore/archive/snapshot.py b/histore/archive/snapshot.py index 8e48439..8b584b5 100644 --- a/histore/archive/snapshot.py +++ b/histore/archive/snapshot.py @@ -53,6 +53,16 @@ def __repr__(self): str(util.to_localtime(self.transaction_time)) ) + @property + def created_at(self): + """Shortcut for transaction timestamp. + + Returns + ------- + datetime.datetime + """ + return self.transaction_time + class SnapshotListing(object): """Listing of snapshot descriptors for snapshots in an archive. Ensures diff --git a/histore/archive/store/fs/base.py b/histore/archive/store/fs/base.py index ff6e6f4..cb81168 100644 --- a/histore/archive/store/fs/base.py +++ b/histore/archive/store/fs/base.py @@ -40,7 +40,8 @@ class ArchiveFileStore(ArchiveStore): faster access. """ def __init__( - self, basedir, replace=False, serializer=None, compression=None + self, basedir, replace=False, serializer=None, compression=None, + encoder=None, decoder=None ): """Initialize the archive archive components. @@ -58,10 +59,16 @@ def __init__( compression: string, default=None String representing the compression mode. Only te data file will be compressed. the metadata file is always storesd as plain text. + encoder: json.JSONEncoder, default=None + Encoder used when writing archive rows as JSON objects to file. + decoder: func, default=None + Custom decoder function when reading archive rows from file. """ self.basedir = util.createdir(basedir) self.serializer = serializer if serializer else DefaultSerializer() self.compression = compression + self.encoder = encoder + self.decoder = decoder # Initialize the file names self.datafile = os.path.join(self.basedir, 'rows.dat') self.metafile = os.path.join(self.basedir, 'metadata.dat') @@ -151,7 +158,8 @@ def get_reader(self): return ArchiveFileReader( filename=self.datafile, serializer=self.serializer, - compression=self.compression + compression=self.compression, + decoder=self.decoder ) def get_schema(self): @@ -186,5 +194,6 @@ def get_writer(self): filename=self.tmpdatafile, row_counter=self.row_counter, serializer=self.serializer, - compression=self.compression + compression=self.compression, + encoder=self.encoder ) diff --git a/histore/archive/store/fs/reader.py b/histore/archive/store/fs/reader.py index bc9ee23..3d0b49d 100644 --- a/histore/archive/store/fs/reader.py +++ b/histore/archive/store/fs/reader.py @@ -12,6 +12,9 @@ import json import os +from datetime import datetime +from dateutil.parser import isoparse + from histore.archive.reader import ArchiveReader from histore.archive.serialize.default import DefaultSerializer @@ -22,7 +25,9 @@ class ArchiveFileReader(ArchiveReader): """Reader for rows in a dataset archive. Reads rows in ascending order of their identifier. """ - def __init__(self, filename, serializer=None, compression=None): + def __init__( + self, filename, serializer=None, compression=None, decoder=None + ): """ Parameters ---------- @@ -33,16 +38,21 @@ def __init__(self, filename, serializer=None, compression=None): serialize rows that are written to file. compression: string, default=None String representing the compression mode for the output file. + decoder: func, default=None + Custom decoder function when reading archive rows from file. If not + given, the default decoder will be used. """ self.serializer = serializer if serializer else DefaultSerializer() + # Use the default decoder if None is given. + self.decoder = decoder if decoder is not None else default_decoder # The archive is empty if the given file does not exists. In this case - # the buffer will remain empty, + # the buffer will remain empty. self.buffer = None if os.path.isfile(filename): self.fin = util.inputstream(filename, compression=compression) # Read the first two lines in the output file. The first line is # expected to be '['. The second line is either ']' (for an empty - # archive) or contain the first row. + # archive) or it contains the first row. if self.fin.readline() != '[': raise ValueError('invalid input file {}'.format(filename)) self.next() @@ -85,6 +95,36 @@ def next(self): # element. if line.endswith(','): line = line[:-1] - self.buffer = self.serializer.deserialize_row(json.loads(line)) + # Decode and deserialize the archive row object. + obj = json.loads(line, object_hook=self.decoder) + self.buffer = self.serializer.deserialize_row(obj) # Return the previous buffer value as the result. return result + + +# -- Helper Functions --------------------------------------------------------- + +def default_decoder(obj): + """Default Json obkject decoder. Accounts for date types that have been + encoded as dictionaries. + + Parameters + ---------- + obj: dict + Json object that is being encountered by the Json reader + + Returns + ------- + dict + """ + if '$datetime' in obj: + return isoparse(obj['$datetime']) + elif '$date' in obj: + return isoparse(obj['$date']).date() + elif '$time' in obj: + val = obj['$time'] + if '.' in val: + return datetime.strptime(val, '%H:%M:%S.%f').time() + else: + return datetime.strptime(val, '%H:%M:%S').time() + return obj diff --git a/histore/archive/store/fs/writer.py b/histore/archive/store/fs/writer.py index 86cd502..806536b 100644 --- a/histore/archive/store/fs/writer.py +++ b/histore/archive/store/fs/writer.py @@ -12,6 +12,8 @@ import json import numpy as np +from datetime import datetime, date, time + from histore.key.base import KeyValue from histore.archive.serialize.default import DefaultSerializer from histore.archive.writer import ArchiveWriter @@ -26,7 +28,8 @@ class ArchiveFileWriter(ArchiveWriter): and close the array. """ def __init__( - self, filename, row_counter=0, serializer=None, compression=None + self, filename, row_counter=0, serializer=None, compression=None, + encoder=None ): """Initialize the output file, row counter, and the serializer that is being used. @@ -44,10 +47,14 @@ def __init__( serialize rows that are written to file. compression: string, default=None String representing the compression mode for the output file. + encoder: json.JSONEncoder, default=None + Encoder used when writing archive rows as JSON objects to file. """ super(ArchiveFileWriter, self).__init__(row_counter) # Use the default serializer if no serializer was given self.serializer = serializer if serializer else DefaultSerializer() + # Use the default JSONEncoder if no encoder is given + self.encoder = encoder if encoder is not None else DefaultEncoder # Open output file for writing. self.fout = util.outputstream(filename, compression=compression) # Write buffer to keep track of the last row that is being written. @@ -88,7 +95,7 @@ def write_buffer(self, row=None): # output that row or open the output array. if self.buffer is not None: text = self.serializer.serialize_row(self.buffer) - line = json.dumps(text, cls=NumpyEncoder) + line = json.dumps(text, cls=self.encoder) if row is not None: line += ',' else: @@ -100,21 +107,39 @@ def write_buffer(self, row=None): # -- Helper classes ----------------------------------------------------------- -class NumpyEncoder(json.JSONEncoder): - """Json encoder that handles numpy data types from pandas data frames. - - Based on https://stackoverflow.com/questions/50916422. +class DefaultEncoder(json.JSONEncoder): + """Json encoder that handles numpy data types from pandas data frames and + datetime objects. """ def default(self, obj): - """Convert numpy data types to default Python types.""" + """Convert numpy data types to default Python types Date objects are + converted to dictionaries using the class an label (prefixed by the + special character '$') and the ISO format string representation as the + value. + + Parameters + ---------- + obj: any + Value that is being encoded. + + Returns + ------- + any + """ if isinstance(obj, np.integer): return int(obj) elif isinstance(obj, np.floating): return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() + elif isinstance(obj, datetime): + return {'$datetime': obj.isoformat()} + elif isinstance(obj, date): + return {'$date': obj.isoformat()} + elif isinstance(obj, time): + return {'$time': obj.isoformat()} elif isinstance(obj, KeyValue): assert not obj.is_new() return obj.value else: - return super(NumpyEncoder, self).default(obj) + return super(DefaultEncoder, self).default(obj) diff --git a/histore/archive/timestamp.py b/histore/archive/timestamp.py index 538863a..74ab193 100644 --- a/histore/archive/timestamp.py +++ b/histore/archive/timestamp.py @@ -36,21 +36,12 @@ def __init__(self, start, end=None): def __repr__(self): """Unambiguous string representation of this time interval. - Returns - ------- - string - """ - return '[%i,%i]' % (self.start, self.end) - - def __str__(self): - """Readable string representation of this time interval. - Returns ------- string """ if self.start < self.end: - return '%i-%i' % (self.start, self.end) + return '{}-{}'.format(self.start, self.end) else: return str(self.start) @@ -110,9 +101,8 @@ def overlap(self, interval): """ if self.start <= interval.start: return self.end >= interval.start - elif self.start >= interval.start: + else: return interval.end >= self.start - return False class Timestamp(object): diff --git a/histore/archive/value.py b/histore/archive/value.py index 526b4f6..51c907a 100644 --- a/histore/archive/value.py +++ b/histore/archive/value.py @@ -22,7 +22,7 @@ class ArchiveValue(metaclass=ABCMeta): """The archive value represents the history of a cell in a tabular dataset. """ @abstractmethod - def at_version(self, version, raise_error=True): + def at_version(self, version, raise_error=True): # pragma: no cover """Get cell value for the given version. Raises ValueError if the cell does not have a value for the given version and the raise error flag is set tot True. If the flag is false None is returned instead. @@ -71,7 +71,7 @@ def diff(self, original_version, new_version): return None @abstractmethod - def extend(self, version, origin): + def extend(self, version, origin): # pragma: no cover """Extend the timestamp of the value that was valid at the given source version with the new version identifier. If no value was valid at the given version of origin the value is returned unchanged. @@ -105,7 +105,7 @@ def is_multi_version(self): return not self.is_single_version() @abstractmethod - def is_single_version(self): + def is_single_version(self): # pragma: no cover """Helper method to get the type of an archive value. Values can either be single version values or multi-version values. @@ -116,7 +116,7 @@ def is_single_version(self): raise NotImplementedError() @abstractmethod - def merge(self, value, version): + def merge(self, value, version): # pragma: no cover """Add value for the given version into the cell history. Returns a modified copy of the value. diff --git a/histore/cli/__init__.py b/histore/cli/__init__.py new file mode 100644 index 0000000..d319c41 --- /dev/null +++ b/histore/cli/__init__.py @@ -0,0 +1,6 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. diff --git a/histore/cli/archive.py b/histore/cli/archive.py new file mode 100644 index 0000000..4b8706c --- /dev/null +++ b/histore/cli/archive.py @@ -0,0 +1,187 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Command line interface to interact with a manager for archives on the local +file system. +""" + +import click +import sys + +from histore.archive.manager.fs import PersistentArchiveManager + +import histore.config as config +import histore.util as util + + +"""Datetime format string.""" +DTF = '%Y-%m-%d %H:%M:%S' + + +# -- Create a new archive ----------------------------------------------------- + +@click.command(name='create') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.option( + '-k', '--pk', + required=False, + type=str, + help='Comma-separate list of primary key columns' +) +@click.option( + '-c', '--comment', + required=False, + type=str, + help='Optional archive description' +) +@click.option( + '-e', '--encoder', + required=False, + type=str, + help='JSONEncoder class for the new archive' +) +@click.option( + '-d', '--decoder', + required=False, + type=str, + help='JSON decoder function for the new archive' +) +@click.argument('name') +def create_archive(basedir, pk, comment, encoder, decoder, name): + """Create a new archive.""" + manager = get_manager(basedir) + # Split primary key if it contains ','. + primary_key = pk.split(',') if pk is not None else None + try: + manager.create( + name=name, + description=comment, + primary_key=primary_key, + encoder=encoder, + decoder=decoder + ) + click.echo('Archive created!') + except ValueError as ex: + click.echo('{}'.format(ex)) + sys.exit(-1) + + +# -- Delete archive ----------------------------------------------------------- + +@click.command(name='delete') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.option( + '-f', '--force', + is_flag=True, + default=False, + help='Delete without confirmation' +) +@click.argument('name') +def delete_archive(basedir, force, name): + """Delete existing archive.""" + manager = get_manager(basedir) + for archive in manager.list(): + if archive.name() == name: + if not force: + msg = "This will remove archive '{}' permanently" + click.echo(msg.format(name)) + click.confirm('Continue?', default=True, abort=True) + manager.delete(archive.identifier()) + click.echo("Archive '{}' deleted!".format(name)) + return + click.echo("Unknown archive '{}'.".format(name)) + sys.exit(-1) + + +# -- List archives ------------------------------------------------------------ + +@click.command(name='list') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.option( + '-d', '--bydate', + is_flag=True, + default=False, + help='Sort by creation date' +) +def list_archives(basedir, bydate): + """List names of existing archives.""" + manager = get_manager(basedir) + archives = manager.list() + if bydate: + archives = sorted(archives, key=lambda a: a.created_at()) + else: + archives = sorted(archives, key=lambda a: a.name()) + click.echo() + click.echo('Archives') + click.echo('--------') + for archive in archives: + ts = util.to_localtime(archive.created_at()) + click.echo('{} (created at {})'.format( + archive.name(), + ts.strftime('%Y-%m-%d %H:%M:%S') + )) + click.echo() + + +# -- Rename archive ----------------------------------------------------------- + +@click.command(name='rename') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.argument('oldname') +@click.argument('newname') +def rename_archive(basedir, oldname, newname): + """Rename existing archive.""" + manager = get_manager(basedir) + # Get archive with the old name + archive = manager.get_by_name(oldname) + if archive is None: + click.echo("Unknown archive '{}'".format(oldname)) + sys.exit(-1) + try: + manager.rename(archive.identifier(), newname) + except ValueError as ex: + click.echo('{}'.format(ex)) + sys.exit(-1) + + +# -- Helper Functions --------------------------------------------------------- + +def get_manager(basedir): + """Create instance of persistent archive manager assuming that it has been + initialized before. + + Parameters + ---------- + basedir: string + Base directory for the archive manager. + + Returns + ------- + histore.archive.manager.fs.PersistentArchiveManager + """ + basedir = basedir if basedir is not None else config.BASEDIR() + return PersistentArchiveManager(basedir=basedir, exists=True) diff --git a/histore/cli/base.py b/histore/cli/base.py new file mode 100644 index 0000000..72a81af --- /dev/null +++ b/histore/cli/base.py @@ -0,0 +1,65 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Command line interface to interact with a manager for archives on the local +file system. +""" + +import click +import os +import sys + +from histore.archive.manager.fs import PersistentArchiveManager +from histore.cli.archive import ( + create_archive, delete_archive, list_archives, rename_archive +) +from histore.cli.snapshot import ( + checkout_snapshot, commit_snapshot, list_snapshots +) + +import histore.config as config + + +# -- Init the archive manager ------------------------------------------------- + +@click.command(name='init') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +def init_manager(basedir): + """Initialize the archive manager directory.""" + # Test if the base directory exists and is empty. + basedir = basedir if basedir is not None else config.BASEDIR() + if os.path.isdir(basedir): + if os.listdir(basedir): + click.echo('Not an empty directory {}.'.format(basedir)) + sys.exit(-1) + # Create instance of persistent archive manager to setup directories and + # files. + PersistentArchiveManager(basedir=basedir, exists=False) + click.echo("Initialized in {}.".format(os.path.abspath(basedir))) + + +# -- Create command group ----------------------------------------------------- + +@click.group() +def cli(): # pragma: no cover + """Command line interface for HISTORE archive manager.""" + pass + + +cli.add_command(init_manager) +cli.add_command(checkout_snapshot) +cli.add_command(commit_snapshot) +cli.add_command(create_archive) +cli.add_command(delete_archive) +cli.add_command(list_archives) +cli.add_command(list_snapshots) +cli.add_command(rename_archive) diff --git a/histore/cli/snapshot.py b/histore/cli/snapshot.py new file mode 100644 index 0000000..ee61fbf --- /dev/null +++ b/histore/cli/snapshot.py @@ -0,0 +1,211 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Command line interface to interact with a manager for archives on the local +file system. +""" + +import csv +import click +import pandas as pd +import sys + +from histore.cli.archive import get_manager + +import histore.util as util + + +# -- Create a new archive ----------------------------------------------------- + +@click.command(name='commit') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.option( + '-c', '--comment', + required=False, + type=str, + help='Optional snapshot description' +) +@click.option( + '-d', '--delimiter', + required=False, + type=str, + help='One-character string used to separate fields' +) +@click.option( + '-q', '--quotechar', + required=False, + type=str, + help='One-character string used to quote fields with special characters' +) +@click.option( + '-z', '--gzip', + is_flag=True, + default=False, + help='Gzip compressed' +) +@click.argument( + 'archive', + type=str +) +@click.argument( + 'filename', + type=click.Path(file_okay=True, dir_okay=False, exists=True) +) +def commit_snapshot( + basedir, comment, delimiter, quotechar, gzip, archive, filename +): + """Commit file to archive.""" + store = get_archive(basedir, archive) + if store is None: + click.echo("Unknown archive '{}'".format(archive)) + sys.exit(-1) + # Read the data frame. + df = pd.read_csv( + filename, + delimiter=get_delimiter(delimiter), + quotechar=quotechar if quotechar is not None else '"', + quoting=csv.QUOTE_MINIMAL, + compression='gzip' if gzip else None + ) + s = store.commit(df, description=comment) + click.echo('Snapshot {} created.'.format(s.version)) + + +# -- Checkout snapshot -------------------------------------------------------- + +@click.command(name='checkout') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.option( + '-d', '--delimiter', + required=False, + type=str, + help='One-character string used to separate fields' +) +@click.option( + '-q', '--quotechar', + required=False, + type=str, + help='One-character string used to quote fields with special characters' +) +@click.option( + '-z', '--gzip', + is_flag=True, + default=False, + help='Gzip compressed' +) +@click.option( + '-v', '--version', + required=False, + type=int, + help='Snapshot version' +) +@click.argument( + 'archive', + type=str +) +@click.argument( + 'filename', + type=click.Path(file_okay=True, dir_okay=False) +) +def checkout_snapshot( + basedir, delimiter, quotechar, gzip, version, archive, filename +): + """Write snapshot to file.""" + store = get_archive(basedir, archive) + if store is None: + click.echo("Unknown archive '{}'".format(archive)) + sys.exit(-1) + # Read the snapshot data frame. + df = store.checkout(version=version) + df.to_csv( + filename, + sep=get_delimiter(delimiter), + quotechar=quotechar if quotechar is not None else '"', + quoting=csv.QUOTE_MINIMAL, + compression='gzip' if gzip else None, + index=False + ) + + +# -- List snapshots ----------------------------------------------------------- + +@click.command(name='snapshots') +@click.option( + '-b', '--basedir', + required=False, + type=click.Path(file_okay=False, dir_okay=True), + help='Base directory for archive files' +) +@click.argument( + 'archive', + type=str +) +def list_snapshots(basedir, archive): + """List snapshots.""" + store = get_archive(basedir, archive) + if store is None: + click.echo("Unknown archive '{}'".format(archive)) + sys.exit(-1) + for s in store.snapshots(): + click.echo('{}\t{}\t{}'.format( + s.version, + util.to_localtime(s.created_at).strftime('%Y-%m-%d %H:%M:%S'), + s.description + )) + + +# -- Helper Function ---------------------------------------------------------- + +def get_archive(basedir, name): + """Get handle for archive with given name. Returns None if no archive with + the given name exists. + + Parameters + ---------- + basedir: string + Base directory for the archive manager. + name: string + Unique archive name. + + Returns + ------- + histore.archive.base.archive + """ + manager = get_manager(basedir) + descriptor = manager.get_by_name(name) + if descriptor is None: + return None + return manager.get(descriptor.identifier()) + + +def get_delimiter(delimiter): + """Get delimiter. Replace encodings for tabulator with tab character. + + Parameters + ---------- + delimiter: string + One-character used to separate fields. + + Returns + ------- + string + """ + if delimiter is None: + return ',' + if delimiter.lower() in ['tab', '\\t']: + return '\t' + return delimiter diff --git a/histore/tests/encode.py b/histore/tests/encode.py new file mode 100644 index 0000000..5e9b8c9 --- /dev/null +++ b/histore/tests/encode.py @@ -0,0 +1,30 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Test encoders and decoders for Json objects.""" + +import json + +from datetime import datetime + + +class TestEncoder(json.JSONEncoder): + """Json encoder that handles datetime objects.""" + def default(self, obj): + """Convert datatime to dictionary.""" + if isinstance(obj, datetime): + return {'$dt': obj.isoformat()} + return obj.value # Assumes histore.key.base.KeyValue + + +def test_decoder(obj): + """Decode objects generated by the TestEncoder. Returns datetime object + as string instead of datetime instances. + """ + if '$dt' in obj: + return obj['$dt'] + return obj diff --git a/histore/util.py b/histore/util.py index 5189c67..d381611 100644 --- a/histore/util.py +++ b/histore/util.py @@ -11,6 +11,7 @@ from datetime import datetime from dateutil.parser import isoparse from dateutil.tz import UTC +from importlib import import_module import errno import gzip @@ -68,6 +69,28 @@ def utc_now(): return datetime.now(UTC) +# -- Dynamic import ---------------------------------------------------------- + +def import_obj(import_path): + """Import an object (function or class) from a given package path. + + Parameters + ---------- + import_path: string + Full package target path for the imported object. Assumes that path + components are separated by '.'. + + Returns + ------- + any + """ + pos = import_path.rfind('.') + module_name = import_path[:pos] + class_name = import_path[pos+1:] + module = import_module(module_name) + return getattr(module, class_name) + + # -- I/O ---------------------------------------------------------------------- def createdir(directory, abs=False): diff --git a/histore/version.py b/histore/version.py index 831bb7d..f969437 100644 --- a/histore/version.py +++ b/histore/version.py @@ -6,4 +6,4 @@ # file LICENSE for full license details. """Code version information for histore.""" -__version__ = '0.1.1' +__version__ = '0.1.2' diff --git a/requirements.txt b/requirements.txt index a238c1a..9cf0ea3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ jsonschema>=3.2.0 python-dateutil pyyaml psutil +Click>=7.0 diff --git a/setup.py b/setup.py index 97e6e41..182bb24 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ 'jsonschema>=3.2.0', 'python-dateutil', 'pyyaml', - 'psutil' + 'psutil', + 'Click>=7.0' ] @@ -73,6 +74,11 @@ extras_require=extras_require, tests_require=tests_require, install_requires=install_requires, + entry_points={ + 'console_scripts': [ + 'histore = histore.cli.base:cli', + ] + }, classifiers=[ 'License :: OSI Approved :: BSD License', 'Operating System :: OS Independent', diff --git a/tests/archive/manager/test_archive_descriptor.py b/tests/archive/manager/test_archive_descriptor.py index 0494d8f..56a86c7 100644 --- a/tests/archive/manager/test_archive_descriptor.py +++ b/tests/archive/manager/test_archive_descriptor.py @@ -13,22 +13,29 @@ from histore.archive.manager.descriptor import ArchiveDescriptor +import histore.util as util + def test_archive_descriptor(): """Test methods for creating archive descriptors.""" # Create descriptor using a dictionary. doc = { 'id': '0000', + 'createdAt': util.utc_now().isoformat(), 'name': 'My Archive', 'description': 'This is my archive', - 'primaryKey': ['SSN'] + 'primaryKey': ['SSN'], + 'encoder': 'myencoder', + 'decoder': 'mydecoder' } descriptor = ArchiveDescriptor(doc) assert descriptor.identifier() == '0000' assert descriptor.name() == 'My Archive' assert descriptor.description() == 'This is my archive' assert descriptor.primary_key() == ['SSN'] - doc = {'id': '0001'} + assert descriptor.encoder() == 'myencoder' + assert descriptor.decoder() == 'mydecoder' + doc = {'id': '0001', 'createdAt': util.utc_now().isoformat()} descriptor = ArchiveDescriptor(doc) assert descriptor.identifier() == '0001' assert descriptor.name() == '0001' @@ -38,12 +45,16 @@ def test_archive_descriptor(): descriptor = ArchiveDescriptor.create( name='My Archive', description='This is my archive', - primary_key='SSN' + primary_key='SSN', + encoder='myencoder', + decoder='mydecoder' ) assert descriptor.identifier() is not None assert descriptor.name() == 'My Archive' assert descriptor.description() == 'This is my archive' assert descriptor.primary_key() == ['SSN'] + assert descriptor.encoder() == 'myencoder' + assert descriptor.decoder() == 'mydecoder' descriptor = ArchiveDescriptor.create() assert descriptor.identifier() is not None assert descriptor.name() == descriptor.identifier() diff --git a/tests/archive/manager/test_persistent_manager.py b/tests/archive/manager/test_persistent_manager.py index e29e626..33c0210 100644 --- a/tests/archive/manager/test_persistent_manager.py +++ b/tests/archive/manager/test_persistent_manager.py @@ -8,8 +8,11 @@ """Unit tests for the archive descriptor.""" import os +import pandas as pd import pytest +from datetime import datetime + from histore.archive.manager.fs import PersistentArchiveManager import histore.config as config @@ -40,7 +43,7 @@ def test_persistent_archive_manager(tmpdir): manager.get('unknown') # Reload the archive manager. os.environ[config.ENV_HISTORE_BASEDIR] = str(tmpdir) - manager = PersistentArchiveManager() + manager = PersistentArchiveManager(exists=True) assert len(manager.archives()) == 1 archive = manager.get(descriptor.identifier()) assert archive is not None @@ -53,3 +56,42 @@ def test_persistent_archive_manager(tmpdir): manager.get(descriptor.identifier()) # Cleanup the environment del os.environ[config.ENV_HISTORE_BASEDIR] + # Error case when using the exist flag. + with pytest.raises(ValueError): + PersistentArchiveManager( + basedir=os.path.join(str(tmpdir), 'ABC'), + exists=True + ) + + +def test_default_json_encoder(tmpdir): + """Test persistent archives with default Json encoder.""" + # Use the default encoder and decoder. + manager = PersistentArchiveManager(basedir=str(tmpdir)) + descriptor = manager.create(name='Archive') + archive = manager.get(descriptor.identifier()) + dt = datetime.now() + archive.commit(pd.DataFrame(data=[[dt]])) + df = archive.checkout() + assert df.shape == (1, 1) + assert df.iloc[0][0] == dt + assert isinstance(df.iloc[0][0], datetime) + + +def test_custom_json_encoder(tmpdir): + """Test persistent archives with custom Json encoder.""" + # Use the default encoder and decoder. + manager = PersistentArchiveManager(basedir=str(tmpdir)) + descriptor = manager.create( + name='Archive', + encoder='histore.tests.encode.TestEncoder', + decoder='histore.tests.encode.test_decoder' + ) + manager = PersistentArchiveManager(basedir=str(tmpdir)) + archive = manager.get(descriptor.identifier()) + dt = datetime.now() + archive.commit(pd.DataFrame(data=[[dt, 'A']])) + df = archive.checkout() + assert df.shape == (1, 2) + assert df.iloc[0][0] == dt.isoformat() + assert isinstance(df.iloc[0][0], str) diff --git a/tests/archive/manager/test_volatile_manager.py b/tests/archive/manager/test_volatile_manager.py index 24b9c51..e2ff4b3 100644 --- a/tests/archive/manager/test_volatile_manager.py +++ b/tests/archive/manager/test_volatile_manager.py @@ -16,6 +16,7 @@ def test_volatile_archive_manager(): """Test functionality of the volatile archive manager.""" manager = VolatileArchiveManager() assert len(manager.archives()) == 0 + # Create archive descriptor = manager.create( name='First archive', description='My first archive', @@ -26,6 +27,10 @@ def test_volatile_archive_manager(): assert descriptor.description() == 'My first archive' assert descriptor.primary_key() == ['SSN'] assert len(manager.archives()) == 1 + # Create archive with existing name. + with pytest.raises(ValueError): + manager.create(name='First archive') + # List archive(s) descriptor = manager.list()[0] assert descriptor.identifier() is not None assert descriptor.name() == 'First archive' @@ -33,10 +38,23 @@ def test_volatile_archive_manager(): assert descriptor.primary_key() == ['SSN'] archive = manager.get(descriptor.identifier()) assert archive is not None + # Rename the archive. + manager.rename(descriptor.identifier(), 'Some archive') + assert manager.get_by_name('My first archive') is None + assert manager.get_by_name('Some archive') is not None + # No error when archive name is identical to new name + manager.rename(descriptor.identifier(), 'Some archive') + # Error when renaming an unknown archive. + with pytest.raises(ValueError): + manager.rename('unknown', 'My archive') + # Error when renaming to an existing archive. + manager.create(name='First archive') + with pytest.raises(ValueError): + manager.rename(descriptor.identifier(), 'First archive') # Delete the archive manager.delete(descriptor.identifier()) - assert len(manager.archives()) == 0 + assert len(manager.archives()) == 1 manager.delete(descriptor.identifier()) - # Error cases + # Error when accessing non-existing archive with pytest.raises(ValueError): manager.get(descriptor.identifier()) diff --git a/tests/archive/store/test_buffered_reader.py b/tests/archive/store/test_buffered_reader.py new file mode 100644 index 0000000..7f3950a --- /dev/null +++ b/tests/archive/store/test_buffered_reader.py @@ -0,0 +1,29 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Unit tests for the buffered archive reader.""" + +import pandas as pd + +from histore.archive.base import Archive + + +def test_archive_reader_iterate(tmpdir): + """Test reading rows in an archive using the row iterator provided by the + archive reader. + """ + # Create archive in main memory + archive = Archive() + # First snapshot + df = pd.DataFrame( + data=[['Alice', 32], ['Bob', 45], ['Claire', 27], ['Alice', 23]], + columns=['Name', 'Age'] + ) + archive.commit(df) + reader = archive.reader() + for row in reader: + assert row.rowid >= 0 diff --git a/tests/archive/store/test_json_encode.py b/tests/archive/store/test_json_encode.py new file mode 100644 index 0000000..bcf93a9 --- /dev/null +++ b/tests/archive/store/test_json_encode.py @@ -0,0 +1,41 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Unit test for the default JSON encoder and decoder.""" + +import json +import numpy as np + +from datetime import datetime, time + +from histore.archive.store.fs.reader import default_decoder +from histore.archive.store.fs.writer import DefaultEncoder + + +def test_datetime_objects(): + """Test encoding and decoding date time objects.""" + dt = datetime.now() + d = dt.date() + t = dt.time() + doc = {'a': 'X', 'b': dt, 'c': d, 'd': t, 'e': time(11, 15)} + obj = json.dumps(doc, cls=DefaultEncoder) + doc = json.loads(obj, object_hook=default_decoder) + assert doc['a'] == 'X' + assert doc['b'] == dt + assert doc['c'] == d + assert doc['d'] == t + assert doc['e'] == time(11, 15) + + +def test_numpy_objects(): + """Test encoding numpy objects.""" + doc = {'a': 'X', 'b': np.int(1), 'c': np.float(1.2), 'd': np.array([1, 2])} + doc = json.loads(json.dumps(doc, cls=DefaultEncoder)) + assert doc['a'] == 'X' + assert doc['b'] == 1 + assert doc['c'] == 1.2 + assert doc['d'] == [1, 2] diff --git a/tests/archive/test_archive_cases.py b/tests/archive/test_archive_cases.py new file mode 100644 index 0000000..b1217ff --- /dev/null +++ b/tests/archive/test_archive_cases.py @@ -0,0 +1,45 @@ +# This file is part of the History Store (histore). +# +# Copyright (C) 2018-2020 New York University. +# +# The History Store (histore) is released under the Revised BSD License. See +# file LICENSE for full license details. + +"""Unit tests for special (error) cases for the archive object.""" + +import pandas as pd +import pytest + +from histore.archive.base import Archive +from histore.document.mem.dataframe import DataFrameDocument + + +def test_special_checkout_cases(): + """Test various special (error) cases for commit and checkout.""" + archive = Archive() + # Version 0 + df = pd.DataFrame( + data=[['Alice', 1], ['Bob', 1], ['Claire', 1], ['Alice', 2]], + columns=['Name', 'Index'] + ) + s = archive.commit(DataFrameDocument(df)) + assert s.version == 0 + # Last version is checked out by default. + df = archive.checkout(version=0) + assert list(df.index) == [0, 1, 2, 3] + df = archive.checkout() + assert list(df.index) == [0, 1, 2, 3] + # Partial merge with given origin. + df = pd.DataFrame(data=[['Alice', 1]], columns=df.columns) + s = archive.commit(df, origin=0) + assert s.version == 1 + # Checkout an unknown version. + with pytest.raises(ValueError): + archive.checkout(version=10) + + +def test_partial_commit_to_empty_archive(): + """Test error when committing a partial snapshot to an empty archive.""" + df = pd.DataFrame(data=[[1]]) + with pytest.raises(ValueError): + Archive().commit(df, partial=True) diff --git a/tests/archive/test_archive_row.py b/tests/archive/test_archive_row.py index 8927421..7587424 100644 --- a/tests/archive/test_archive_row.py +++ b/tests/archive/test_archive_row.py @@ -67,6 +67,14 @@ def test_extend_archive_row(): cells=dict(), timestamp=ts ) + assert str(row) == '\n'.join([ + '' + ]) row = row.merge(pos=1, values={1: 'A', 2: 1, 3: 'a'}, version=2) row = row.merge(pos=1, values={1: 'B', 2: 1, 3: 'b'}, version=3) row = row.extend(version=4, origin=2) @@ -74,8 +82,15 @@ def test_extend_archive_row(): assert pos == 1 assert values == ['A', 1, 'a'] row = row.extend(version=5, origin=1) + # Error for unknown version. with pytest.raises(ValueError): - row.at_version(version=5, columns=[1, 2, 3]) + row.at_version(version=5, columns=[1, 2]) + # Error for unknown column. + with pytest.raises(ValueError): + row.at_version(version=4, columns=[1, 5], raise_error=True) + pos, values = row.at_version(version=4, columns=[1, 5], raise_error=False) + assert pos == 1 + assert values == ['A', None] pos, values = row.at_version( version=5, columns=[1, 2, 3], @@ -171,6 +186,7 @@ def test_row_provenance(): }), timestamp=Timestamp(intervals=TimeInterval(1, 5)) ) + assert row.diff(0, 0) is None assert row.diff(0, 1).is_insert() assert row.diff(5, 6).is_delete() prov = row.diff(1, 2) diff --git a/tests/archive/test_archive_serializer.py b/tests/archive/test_archive_serializer.py index 1d14670..1c66f5d 100644 --- a/tests/archive/test_archive_serializer.py +++ b/tests/archive/test_archive_serializer.py @@ -22,6 +22,14 @@ import histore.util as util +def test_invalid_label(): + """Test error when creating a serializer with an invalid label value.""" + s = DefaultSerializer(timestamp='ts') + assert s.timestamp == 'ts' + with pytest.raises(ValueError): + DefaultSerializer(timestamp='$ts') + + def test_serialize_column(): """Test (de-)serialization of archive schema columns.""" serializer = DefaultSerializer() @@ -80,6 +88,15 @@ def test_serialize_snapshot(): assert s.valid_time == snapshot.valid_time assert s.transaction_time == snapshot.transaction_time assert s.description == snapshot.description + # For completeness, test deserializing a snapshot object without + # description element. + snapshot.description = None + obj = serializer.serialize_snapshot(snapshot) + s = serializer.deserialize_snapshot(obj) + assert s.version == snapshot.version + assert s.valid_time == snapshot.valid_time + assert s.transaction_time == snapshot.transaction_time + assert s.description == '' # -- Snapshot with description -------------------------------------------- snapshot = Snapshot(0, valid_time=vt, description='First snapshot') obj = serializer.serialize_snapshot(snapshot) @@ -124,6 +141,9 @@ def test_serialize_value(): value = serializer.deserialize_value(obj=obj, ts=ts) assert value.timestamp.is_equal(Timestamp(version=1)) assert value.value == 1 + # Error case for invalid object + with pytest.raises(ValueError): + serializer.deserialize_value(obj='A', ts=ts) # -- Multi-version value -------------------------------------------------- value = SingleVersionValue(value=1, timestamp=Timestamp(version=1)) value = value.merge(value='A', version=2) diff --git a/tests/archive/test_snapshot.py b/tests/archive/test_snapshot.py index 2eeb2dd..14fdb3e 100644 --- a/tests/archive/test_snapshot.py +++ b/tests/archive/test_snapshot.py @@ -24,6 +24,7 @@ def test_append_snapshots(): assert snapshots.last_snapshot() is not None s = snapshots.last_snapshot() assert s.version == 0 + assert str(s).startswith('