Skip to content

Commit

Permalink
Merge pull request #12 from heikomuller/dev0.1.4
Browse files Browse the repository at this point in the history
Dev0.1.4
  • Loading branch information
heikomuller authored Oct 7, 2020
2 parents f9cce73 + c9bb602 commit 4df4b00
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 12 deletions.
5 changes: 5 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,8 @@
### 0.1.3 - 2020-10-05

* Add archive manager that maintains descriptors in a relational database (\#8)


### 0.1.4 - 2020-10-07

* Add index position information to column class (\#11)
4 changes: 3 additions & 1 deletion histore/archive/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,9 @@ def at_version(self, version):
# Sort columns based on their position and return a list of snapshot
# columns.
cols.sort(key=lambda x: x[2])
return [Column(colid=id, name=name) for id, name, _ in cols]
return [
Column(colid=id, name=name, colidx=pos) for id, name, pos in cols
]

def merge(
self, columns, version, matching=MATCH_IDNAME, renamed=None,
Expand Down
11 changes: 9 additions & 2 deletions histore/document/mem/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,16 @@ def __init__(self, doc, validate=True):
columns = list()
for obj in doc['columns']:
if isinstance(obj, dict):
columns.append(Column(colid=obj['id'], name=obj['name']))
col = Column(
colid=obj['id'],
name=obj['name'],
colidx=len(columns)
)
else:
columns.append(obj)
# Assumes that the object is a scalar value (string or number)
# representing the column name.
col = Column(colid=-1, name=obj, colidx=len(columns))
columns.append(col)
# Get the document rows.
rows = doc['data']
# Create the keys for the document rows.
Expand Down
20 changes: 15 additions & 5 deletions histore/document/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,21 @@
column value in a Pandas data frame.
"""

from typing import List, Optional, Union

"""Type alias for column lists."""
Columns = Union[str, int, List[Union[int, str]]]


class Column(str):
"""Columns in openclean data frames are subclasses of Python strings that
"""Columns in histore data frames are subclasses of Python strings that
contain a unique column identifier. This implementation is based on:
https://bytes.com/topic/python/answers/32098-my-experiences-subclassing-string
The order of creation is that the __new__ method is called which returns
the object then __init__ is called.
"""
def __new__(cls, colid, name, *args, **keywargs):
def __new__(cls, colid: int, name: str, colidx: Optional[int] = None):
"""Initialize the String object with the given column name. Ignore the
column identifier.
Expand All @@ -29,10 +34,12 @@ def __new__(cls, colid, name, *args, **keywargs):
Unique column identifier
name: string
Column name
colidx: int, default=None
Index position of the column in a dataset schema.
"""
return str.__new__(cls, str(name))

def __init__(self, colid, name):
def __init__(self, colid: int, name: str, colidx: Optional[int] = None):
"""Initialize the unique column identifier. The column name has already
been initialized by the __new__ method that is called prior to the
__init__ method.
Expand All @@ -43,14 +50,17 @@ def __init__(self, colid, name):
Unique column identifier
name: string
Column name
colidx: int, default=None
Index position of the column in a dataset schema.
"""
self.colid = colid
self.colidx = colidx


# -- Helper methods -----------------------------------------------------------


def column_index(schema, columns):
def column_index(schema: List[str], columns: Columns):
"""Get the list of column index positions in a given schema (list of
column names). Columns are either specified by name or by index position.
Expand All @@ -63,7 +73,7 @@ def column_index(schema, columns):
----------
schema: list(string)
List of column names.
columns: list(int or str)
columns: int, str, or list(int or str)
List of column index positions or column names.
Returns
Expand Down
2 changes: 1 addition & 1 deletion histore/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# file LICENSE for full license details.

"""Code version information for histore."""
__version__ = '0.1.3'
__version__ = '0.1.4'
8 changes: 7 additions & 1 deletion tests/archive/manager/test_persistent_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from histore.archive.manager.db.database import DB, TEST_URL
from histore.archive.manager.fs import FileSystemArchiveManager
from histore.archive.manager.persist import PersistentArchiveManager
from histore.document.schema import Column

import histore.config as config

Expand All @@ -36,7 +37,7 @@ def test_create_archive(ManagerCls, kwargs, tmpdir):
# -- Create empty manager instance ----------------------------------------
manager = ManagerCls(**kwargs)
assert len(manager.archives()) == 0
# -- Ad first archive -----------------------------------------------------
# -- Add first archive ----------------------------------------------------
descriptor = manager.create(
name='First archive',
description='My first archive',
Expand Down Expand Up @@ -93,6 +94,11 @@ def test_encoder_default(ManagerCls, kwargs, tmpdir):
assert df.shape == (1, 1)
assert df.iloc[0][0] == dt
assert isinstance(df.iloc[0][0], datetime)
# DataFrame schema
for col in df.columns:
assert isinstance(col, Column)
assert col.colid >= 0
assert col.colidx >= 0


@pytest.mark.parametrize(
Expand Down
8 changes: 8 additions & 0 deletions tests/document/test_document_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,15 @@ def test_column_index():

def test_document_columns():
    """Verify that document schema columns compare equal to their name, are
    string instances, and expose the column identifier together with the
    (optional) index position.
    """
    # Each case pairs the extra constructor arguments with the index position
    # that the resulting column is expected to report.
    cases = [
        (dict(), None),
        (dict(colidx=10), 10),
    ]
    for extra_args, expected_idx in cases:
        column = Column(colid=1, name='my_col', **extra_args)
        assert column == 'my_col'
        assert isinstance(column, str)
        assert column.colid == 1
        if expected_idx is None:
            # Column created without an index position.
            assert column.colidx is None
        else:
            # Column created with an explicit index position.
            assert column.colidx == expected_idx
20 changes: 18 additions & 2 deletions tests/document/test_json_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ def test_json_document_without_key():
doc = JsonDocument(
doc={'columns': ['Name', 'Age'], 'data': [['Bob', 23], ['Alice', 24]]}
)
# -- Test schema identifier and index -------------------------------------
columns = dict()
for col in doc.columns:
columns[col] = (col.colid, col.colidx)
assert columns['Name'] == (-1, 0)
assert columns['Age'] == (-1, 1)
# -- Test row values and positions ----------------------------------------
reader = doc.reader(schema=[Column(0, 'Name'), Column(1, 'Age')])
keys, positions, names = list(), list(), list()
while reader.has_next():
Expand All @@ -46,13 +53,22 @@ def test_json_document_without_key():

def test_json_document_with_pk():
"""Test creating an instance of the Json document with a primary key."""
SCHEMA = [{'id': 1, 'name': 'Name'}, {'id': 0, 'name': 'Age'}]
doc = JsonDocument(
doc={
'columns': SCHEMA,
'columns': [
{'id': 1, 'name': 'Name'},
{'id': 0, 'name': 'Age'}
],
'data': [['Bob', 23], ['Alice', 24]],
'primaryKey': ['Name']}
)
# -- Test schema identifier and index -------------------------------------
columns = dict()
for col in doc.columns:
columns[col] = (col.colid, col.colidx)
assert columns['Name'] == (1, 0)
assert columns['Age'] == (0, 1)
# -- Test row values and positions ----------------------------------------
reader = doc.reader(schema=doc.columns)
keys, positions, names = list(), list(), list()
while reader.has_next():
Expand Down

0 comments on commit 4df4b00

Please sign in to comment.