Merge pull request #12 from heikomuller/dev0.1.4

Dev0.1.4
heikomuller · Oct 7, 2020 · 4df4b00 · 4df4b00
2 parents f9cce73 + c9bb602
commit 4df4b00
Show file tree

Hide file tree

Showing 8 changed files with 66 additions and 12 deletions.
diff --git a/changelog.md b/changelog.md
@@ -23,3 +23,8 @@
 ### 0.1.3 - 2020-10-05
 
 * Add archive manager that maintains descriptors in a relational database (\#8)
+
+
+### 0.1.4 - 2020-10-07
+
+* Add index position information to column class (\#11)
diff --git a/histore/archive/schema.py b/histore/archive/schema.py
@@ -223,7 +223,9 @@ def at_version(self, version):
         # Sort columns based on their position and return a list of snapshot
         # columns.
         cols.sort(key=lambda x: x[2])
-        return [Column(colid=id, name=name) for id, name, _ in cols]
+        return [
+            Column(colid=id, name=name, colidx=pos) for id, name, pos in cols
+        ]
 
     def merge(
         self, columns, version, matching=MATCH_IDNAME, renamed=None,

diff --git a/histore/document/mem/json.py b/histore/document/mem/json.py
@@ -87,9 +87,16 @@ def __init__(self, doc, validate=True):
         columns = list()
         for obj in doc['columns']:
             if isinstance(obj, dict):
-                columns.append(Column(colid=obj['id'], name=obj['name']))
+                col = Column(
+                    colid=obj['id'],
+                    name=obj['name'],
+                    colidx=len(columns)
+                )
             else:
-                columns.append(obj)
+                # Assumes that the object is a scalar value (string or number)
+                # representing the column name.
+                col = Column(colid=-1, name=obj, colidx=len(columns))
+            columns.append(col)
         # Get the document rows.
         rows = doc['data']
         # Create the keys for the document rows.

diff --git a/histore/document/schema.py b/histore/document/schema.py
@@ -10,16 +10,21 @@
 column value in a Pandas data frame.
 """
 
+from typing import List, Optional, Union
+
+"""Type alias for column lists."""
+Columns = Union[str, int, List[Union[int, str]]]
+
 
 class Column(str):
-    """Columns in openclean data frames are subclasses of Python strings that
+    """Columns in histore data frames are subclasses of Python strings that
     contain a unique column identifier. This implementation is based on:
     https://bytes.com/topic/python/answers/32098-my-experiences-subclassing-string
 
     The order of creation is that the __new__ method is called which returns
     the object then __init__ is called.
     """
-    def __new__(cls, colid, name, *args, **keywargs):
+    def __new__(cls, colid: int, name: str, colidx: Optional[int] = None):
         """Initialize the String object with the given column name. Ignore the
         column identifier.
 
@@ -29,10 +34,12 @@ def __new__(cls, colid, name, *args, **keywargs):
             Unique column identifier
         name: string
             Column name
+        colidx: int, default=None
+            Index position of the column in a dataset schema.
         """
         return str.__new__(cls, str(name))
 
-    def __init__(self, colid, name):
+    def __init__(self, colid: int, name: str, colidx: Optional[int] = None):
         """Initialize the unique column identifier. The column name has already
         been initialized by the __new__ method that is called prior to the
         __init__ method.
@@ -43,14 +50,17 @@ def __init__(self, colid, name):
             Unique column identifier
         name: string
             Column name
+        colidx: int, default=None
+            Index position of the column in a dataset schema.
         """
         self.colid = colid
+        self.colidx = colidx
 
 
 # -- Helper methods -----------------------------------------------------------
 
 
-def column_index(schema, columns):
+def column_index(schema: List[str], columns: Columns):
     """Get the list of column index positions in a given schema (list of
     column names). Columns are either specified by name or by index position.
 
@@ -63,7 +73,7 @@ def column_index(schema, columns):
     ----------
     schema: list(string)
         List of column names.
-    columns: list(int or str)
+    columns: int, str, or list(int or str)
         List of column index positions or column names.
 
     Returns

diff --git a/histore/version.py b/histore/version.py
@@ -6,4 +6,4 @@
 # file LICENSE for full license details.
 
 """Code version information for histore."""
-__version__ = '0.1.3'
+__version__ = '0.1.4'
diff --git a/tests/archive/manager/test_persistent_manager.py b/tests/archive/manager/test_persistent_manager.py
@@ -17,6 +17,7 @@
 from histore.archive.manager.db.database import DB, TEST_URL
 from histore.archive.manager.fs import FileSystemArchiveManager
 from histore.archive.manager.persist import PersistentArchiveManager
+from histore.document.schema import Column
 
 import histore.config as config
 
@@ -36,7 +37,7 @@ def test_create_archive(ManagerCls, kwargs, tmpdir):
     # -- Create empty manager instance ----------------------------------------
     manager = ManagerCls(**kwargs)
     assert len(manager.archives()) == 0
-    # -- Ad first archive -----------------------------------------------------
+    # -- Add first archive ----------------------------------------------------
     descriptor = manager.create(
         name='First archive',
         description='My first archive',
@@ -93,6 +94,11 @@ def test_encoder_default(ManagerCls, kwargs, tmpdir):
     assert df.shape == (1, 1)
     assert df.iloc[0][0] == dt
     assert isinstance(df.iloc[0][0], datetime)
+    # DataFrame schema
+    for col in df.columns:
+        assert isinstance(col, Column)
+        assert col.colid >= 0
+        assert col.colidx >= 0
 
 
 @pytest.mark.parametrize(

diff --git a/tests/document/test_document_schema.py b/tests/document/test_document_schema.py
@@ -32,7 +32,15 @@ def test_column_index():
 
 def test_document_columns():
     """Test creating instances of document schema columns."""
+    # -- Column without index position ----------------------------------------
     col = Column(colid=1, name='my_col')
     assert col == 'my_col'
     assert isinstance(col, str)
     assert col.colid == 1
+    assert col.colidx is None
+    # -- Column with index position -------------------------------------------
+    col = Column(colid=1, name='my_col', colidx=10)
+    assert col == 'my_col'
+    assert isinstance(col, str)
+    assert col.colid == 1
+    assert col.colidx == 10
diff --git a/tests/document/test_json_document.py b/tests/document/test_json_document.py
@@ -32,6 +32,13 @@ def test_json_document_without_key():
     doc = JsonDocument(
         doc={'columns': ['Name', 'Age'], 'data': [['Bob', 23], ['Alice', 24]]}
     )
+    # -- Test schema identifier and index -------------------------------------
+    columns = dict()
+    for col in doc.columns:
+        columns[col] = (col.colid, col.colidx)
+    assert columns['Name'] == (-1, 0)
+    assert columns['Age'] == (-1, 1)
+    # -- Test row values and positions ----------------------------------------
     reader = doc.reader(schema=[Column(0, 'Name'), Column(1, 'Age')])
     keys, positions, names = list(), list(), list()
     while reader.has_next():
@@ -46,13 +53,22 @@ def test_json_document_without_key():
 
 def test_json_document_with_pk():
     """Test creating an instance of the Json document with a primary key."""
-    SCHEMA = [{'id': 1, 'name': 'Name'}, {'id': 0, 'name': 'Age'}]
     doc = JsonDocument(
         doc={
-            'columns': SCHEMA,
+            'columns': [
+                {'id': 1, 'name': 'Name'},
+                {'id': 0, 'name': 'Age'}
+            ],
             'data': [['Bob', 23], ['Alice', 24]],
             'primaryKey': ['Name']}
     )
+    # -- Test schema identifier and index -------------------------------------
+    columns = dict()
+    for col in doc.columns:
+        columns[col] = (col.colid, col.colidx)
+    assert columns['Name'] == (1, 0)
+    assert columns['Age'] == (0, 1)
+    # -- Test row values and positions ----------------------------------------
     reader = doc.reader(schema=doc.columns)
     keys, positions, names = list(), list(), list()
     while reader.has_next():