diff --git a/pixel_classification/learn_sample.py b/pixel_classification/learn_sample.py index 89e2db7..42190c9 100644 --- a/pixel_classification/learn_sample.py +++ b/pixel_classification/learn_sample.py @@ -36,7 +36,8 @@ def classify(alg='softmax', data=None, path_to_pickled=None, binary=None): 'pickled dict of form:\n{}'.format(dct_form)) data = StructuredData(data) - pca = data.principal_components(return_percentile=95.) + data.make_binary(binary_true=binary, inplace=True) + data.principal_components(return_percentile=95.) mapping = {'softmax': softmax, 'neural_net': mlp} diff --git a/pixel_prep/compose_array.py b/pixel_prep/compose_array.py index 146a9a8..d3442d7 100644 --- a/pixel_prep/compose_array.py +++ b/pixel_prep/compose_array.py @@ -54,8 +54,7 @@ def load_irrigation_data(shapefile, rasters, pickle_path=None, :return: numpy.ndarray """ - df = point_target_extract(points=shapefile, nlcd_path=nlcd_path, - target_shapefile=target_shapefiles, + df = point_target_extract(points=shapefile, nlcd_path=nlcd_path, target_shapefile=target_shapefiles, count_limit=count) rasters = raster_paths(rasters) @@ -100,8 +99,8 @@ def recursive_file_gen(mydir): yield os.path.join(root, file) -def point_target_extract(points, nlcd_path, target_shapefile=None, - count_limit=None): +def point_target_extract(points, nlcd_path, + target_shapefile=None, count_limit=None): point_data = {} with fopen(points, 'r') as src: for feature in src: @@ -130,20 +129,25 @@ def point_target_extract(points, nlcd_path, target_shapefile=None, break if not has_attr: - with rasopen(nlcd_path, 'r') as rsrc: - rass_arr = rsrc.read() - rass_arr = rass_arr.reshape(rass_arr.shape[1], rass_arr.shape[2]) - affine = rsrc.affine - - x, y = val['coords'] - col, row = ~affine * (x, y) - raster_val = rass_arr[int(row), int(col)] + if nlcd_path: + with rasopen(nlcd_path, 'r') as rsrc: + rass_arr = rsrc.read() + rass_arr = rass_arr.reshape(rass_arr.shape[1], rass_arr.shape[2]) + affine = rsrc.affine + + x, y = 
val['coords'] + col, row = ~affine * (x, y) + raster_val = rass_arr[int(row), int(col)] + ltype_dct = {'IType': None, + 'LType': str(raster_val)} + point_data[pt_id]['properties'] = ltype_dct + print('id {} has no FLU, ' + 'nlcd {}'.format(pt_id, + nlcd_value(ltype_dct['LType']))) + else: ltype_dct = {'IType': None, - 'LType': str(raster_val)} + 'LType': None} point_data[pt_id]['properties'] = ltype_dct - print('id {} has no FLU, ' - 'nlcd {}'.format(pt_id, - nlcd_value(ltype_dct['LType']))) idd = [] ltype = [] diff --git a/pixel_prep/prep_structured_data.py b/pixel_prep/prep_structured_data.py index a0470c6..a3a55b2 100644 --- a/pixel_prep/prep_structured_data.py +++ b/pixel_prep/prep_structured_data.py @@ -19,48 +19,79 @@ import numpy as np from pandas import get_dummies from sklearn import decomposition +from sklearn.utils import Bunch class StructuredData(object): - """ Structured data object for ML training, not unlike a sklearn.dataset.load_dataset object""" + """ Structured data object for ML training, based on sklearn.utils.Bunch object""" + + # is passing in a dict required? + # especially for just two values + # also consider this + # what happens in this case + # data = {'foo'} + # sd = StructuredData(data) + # since 'data' and 'target_values' are required make them positional arguments + + def __init__(self, data, target_values, binary=None): + """ + + :param data: array of sample data, no longer a dict; must support .astype (e.g. numpy.ndarray) + + """ - def __init__(self, data): self.lamda = None self.v = None - self.data = data + # saving data to this object is not necessary. + # just use it.
+ # The data dict is never referenced outside of this method + # self.data = data - self.x = self.data['data'].astype(np.float32) - self.y_strs = self.data['target_values'] + # self.x = self.data['data'].astype(np.float32) + # self.y_strs = self.data['target_values'] + self.x = data.astype(np.float32) + self.y_strs = target_values unique, self.y = np.unique(self.y_strs, return_inverse=True) self.classes = unique self.class_counts = {x: list(self.y_strs).count(x) for x in self.classes} print('Class counts: {}'.format(self.class_counts)) - self.class_map = dict(zip(list(unique), list(range(len(unique))))) + + # self.class_map = dict(zip(list(unique), list(range(len(unique))))) + # this is more concise than above + self.class_map = {u: i for i, u in enumerate(unique)} + print('Class integer map: {}'.format(self.class_map)) + if binary: + self.y[self.y_strs == binary] = 1 + self.y[self.y_strs != binary] = 0 + self.y_strs[self.y_strs != binary] = '{}{}'.format('N', binary) + self.one_hot = get_dummies(self.y).values - def make_binary(self, binary_true, inplace=False): - """ Use a key value that will equate to True (1), all others to 0.""" - """ - :param binary_true: - :return: - """ - if inplace: - self.y[self.y_strs == binary_true] = 1 - self.y[self.y_strs != binary_true] = 0 - self.y_strs[self.y_strs != binary_true] = '{}{}'.format('N', binary_true) - unique, _ = np.unique(self.y_strs, return_inverse=True) - self.classes = unique - self.class_counts = {x: list(self.y_strs).count(x) for x in self.classes} - self.one_hot = get_dummies(self.y).values - else: - new = copy.deepcopy(self) - self.make_binary(binary_true, inplace=True) - return new + # # def make_binary(self, binary_true, inplace=False): + # def make_binary(self, binary_true): + # """ Use a key value that will equate to True (1), all others to 0.""" + # """ + # :param binary_true: + # :return: + # """ + # self.y[self.y_strs == binary_true] = 1 + # self.y[self.y_strs != binary_true] = 0 + #
self.y_strs[self.y_strs != binary_true] = '{}{}'.format('N', binary_true) + # unique, _ = np.unique(self.y_strs, return_inverse=True) + # self.classes = unique + # self.class_counts = {x: list(self.y_strs).count(x) for x in self.classes} + # self.one_hot = get_dummies(self.y).values + # + # # not advisable + # # else: + # # new = copy.deepcopy(self) + # # self.make_binary(binary_true, inplace=True) + # # return new def principal_components(self, return_percentile=None, n_components=None): """ Extract eigenvectors and eigenvalue, return desired PCAs"" @@ -73,12 +104,11 @@ def principal_components(self, return_percentile=None, n_components=None): pca = decomposition.PCA(0.95, copy=True, whiten=False) pca.fit(self.x) - print (np.cumsum(pca.explained_variance_ratio_)) + print(np.cumsum(pca.explained_variance_ratio_)) return pca if __name__ == '__main__': home = os.path.expanduser('~') - # ========================= EOF ================================================================ diff --git a/tests/data/LE07_L1TP_039027_20130726_20160907_01_T1_B3_clip.tif b/tests/data/LE07_clip_L1TP_039027_20130726_20160907_01_T1_B3.TIF old mode 100755 new mode 100644 similarity index 100% rename from tests/data/LE07_L1TP_039027_20130726_20160907_01_T1_B3_clip.tif rename to tests/data/LE07_clip_L1TP_039027_20130726_20160907_01_T1_B3.TIF diff --git a/tests/data/P39R27_Quarter_Test.pkl b/tests/data/P39R27_Quarter_Test.pkl new file mode 100644 index 0000000..9574b34 Binary files /dev/null and b/tests/data/P39R27_Quarter_Test.pkl differ diff --git a/tests/data/extract_test_attributed_Z12.dbf b/tests/data/extract_test_attributed_Z12.dbf deleted file mode 100755 index 521bc4c..0000000 Binary files a/tests/data/extract_test_attributed_Z12.dbf and /dev/null differ diff --git a/tests/data/extract_test_attributed_Z12.prj b/tests/data/extract_test_attributed_Z12.prj deleted file mode 100755 index 84eb0d8..0000000 --- a/tests/data/extract_test_attributed_Z12.prj +++ /dev/null @@ -1 +0,0 @@ 
-PROJCS["WGS_1984_UTM_Zone_12N",GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",-111],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["Meter",1]] \ No newline at end of file diff --git a/tests/data/extract_test_attributed_Z12.qpj b/tests/data/extract_test_attributed_Z12.qpj deleted file mode 100755 index f29df82..0000000 --- a/tests/data/extract_test_attributed_Z12.qpj +++ /dev/null @@ -1 +0,0 @@ -PROJCS["WGS 84 / UTM zone 12N",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",-111],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","32612"]] diff --git a/tests/data/extract_test_attributed_Z12.shp b/tests/data/extract_test_attributed_Z12.shp deleted file mode 100755 index a7167da..0000000 Binary files a/tests/data/extract_test_attributed_Z12.shp and /dev/null differ diff --git a/tests/data/extract_test_attributed_Z12.shx b/tests/data/extract_test_attributed_Z12.shx deleted file mode 100755 index a3bdaf3..0000000 Binary files a/tests/data/extract_test_attributed_Z12.shx and /dev/null differ diff --git a/tests/data/nlcd_clip_test.tif b/tests/data/nlcd_clip_test.tif new file mode 100755 index 0000000..65e6afc Binary files /dev/null and b/tests/data/nlcd_clip_test.tif differ diff --git a/tests/data/test.pkl b/tests/data/test.pkl new file mode 100644 index 0000000..320322a Binary 
files /dev/null and b/tests/data/test.pkl differ diff --git a/tests/test_compose_array.py b/tests/test_compose_array.py index aa45589..a32c7bf 100644 --- a/tests/test_compose_array.py +++ b/tests/test_compose_array.py @@ -14,63 +14,37 @@ # limitations under the License. # =============================================================================== +import os import unittest -from fiona import open as fopen -from rasterio import open as rasopen +from pixel_prep.compose_array import load_irrigation_data class TestPointExtract(unittest.TestCase): def setUp(self): - self.shapefile = 'tests/data/extract_test_attributed_Z12.shp' - self.raster = 'tests/data/LE07_L1TP_039027_20130726_20160907_01_T1_B3_clip.tif' + self.shapefile = 'data/extract_no_attrs_z12.shp' + self.raster = 'data/LE07_clip_L1TP_039027_20130726_20160907_01_T1_B3.TIF' + self.nlcd = 'data/nlcd_clip_test.tif' + self.target_polys = 'data/flu_test_z12.shp' + if not os.path.isfile(self.shapefile): + raise ValueError('Path to shapefile is invalid') def tearDown(self): pass - def test_raster_extract_by_point(self): + def test_compose_array(self): """ Test native pet rasters vs. xarray netcdf point extract. :return: """ - points = raster_point_extract(self.raster, self.shapefile) + points = load_irrigation_data(self.shapefile, self.raster, + nlcd_path=self.nlcd, + target_shapefiles=self.target_polys, + ) - for key, val in points.items(): - self.assertEqual(val['raster_val'], val['extract_value']) - - -# ----------------------------------ANCILLARY FUNCTIONS----------------------- - -def raster_point_extract(raster, points): - """ Get point values from a pixel_prep. - - :param raster: local_raster - :param points: Shapefile of points. - :return: Dict of coords, row/cols, and values of pixel_prep at that point. 
- """ - point_data = {} - - with fopen(points, 'r') as src: - for feature in src: - name = feature['id'] - proj_coords = feature['geometry']['coordinates'] - - point_data[name] = {'coords': proj_coords, - 'label': feature['properties']['LType'], - 'raster_val': int(feature['properties']['LE07_L1TP_'])} - - with rasopen(raster, 'r') as rsrc: - rass_arr = rsrc.read() - rass_arr = rass_arr.reshape(rass_arr.shape[1], rass_arr.shape[2]) - affine = rsrc.affine - - for key, val in point_data.items(): - x, y = val['coords'] - col, row = ~affine * (x, y) - raster_val = rass_arr[int(row), int(col)] - val['extract_value'] = raster_val - - return point_data + self.assertEqual(points['target_values'][0], ['I', 'I', 'I', 'F', 'I'][0]) + self.assertEqual(points['data'][0], [63, 51, 54, 82, 0][0]) + self.assertEqual(points['features'][0], '039027_T1') if __name__ == '__main__': diff --git a/tests/test_structured_data.py b/tests/test_structured_data.py new file mode 100644 index 0000000..67d7ada --- /dev/null +++ b/tests/test_structured_data.py @@ -0,0 +1,67 @@ +# ============================================================================================= +# Copyright 2018 dgketchum +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================================= + +import unittest +import pickle +from numpy import array, any +from pixel_prep.prep_structured_data import StructuredData + + +class StructuredDataTest(unittest.TestCase): + def setUp(self): + path_to_pickled = 'data/test.pkl' + with open(path_to_pickled, 'rb') as p: + data = pickle.load(p) + + self.struct = StructuredData(data) + + def test_data_instant(self): + # this assertion is really a test of the test which in general is not done + # self.assertIsInstance(self.struct, StructuredData) + self.assertEquals(self.struct.class_counts['I'], 4) + + def test_class_zero(self): + classes = self.struct.classes + self.assertEquals(classes[0], 'F') + # i try to only do one assert per test + # self.assertAlmostEqual(pca.mean_[0], 50.) + + def test_data_pca_mean(self): + pca = self.struct.principal_components(n_components=1) + self.assertAlmostEqual(pca.mean_[0], 50.) + + def test_data_binary(self): + self.struct.make_binary('I', inplace=True) + + # use self.assert...? + # assert (self.struct.one_hot == array([[0, 1], + # [0, 1], + # [0, 1], + # [1, 0], + # [0, 1], ])).any() + expected = array([[0, 1], + [0, 1], + [0, 1], + [1, 0], + [0, 1]]) + + self.assertListEqual(list(self.struct.one_hot), expected) + + +if __name__ == '__main__': + unittest.main() + +# ========================= EOF ====================================================================