From 5b880eb710850940f5f476522a12c8ea6bbb6099 Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 22 Aug 2023 13:48:30 -0400 Subject: [PATCH 1/2] update dependencies, add ipython as dev dependency --- Pipfile | 1 + Pipfile.lock | 213 ++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 178 insertions(+), 36 deletions(-) diff --git a/Pipfile b/Pipfile index 0c31ed8..1f5f964 100644 --- a/Pipfile +++ b/Pipfile @@ -22,6 +22,7 @@ flake8 = "*" isort = "*" mypy = "*" pytest = "*" +ipython = "*" [requires] python_version = "3.11" diff --git a/Pipfile.lock b/Pipfile.lock index 22ef5e9..36dc669 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "73cbd531ce329d9ebd99e42ac910bee90935e0d6f9f0d1096b4b2492790a1cb7" + "sha256": "d8d1a9284afa82486c23db55536a78974489321be96ca82fe6ffc8fcc3a4ee00" }, "pipfile-spec": 6, "requires": { @@ -34,18 +34,18 @@ }, "boto3": { "hashes": [ - "sha256:8da9621931291b6c261fdaae465f05737c16519b9667d8463181cb8b88444572", - "sha256:a336cf53a6d86ee6d27b2f6d8b78ec9b320209127e5126359881bbd68f33d0b9" + "sha256:2761f3249fe25c3ec1a8cd6b95fca2317747503e6f1d127daf6a3d2cdeb25680", + "sha256:dc6d72470f6d8926b8cdc10ee7708d7ccdd36d6313c7aa298bc1cf6bedb8921e" ], - "version": "==1.28.27" + "version": "==1.28.31" }, "botocore": { "hashes": [ - "sha256:13af1588023750c9bc66d202bb5a934c9412a7dc52587532264ab725c42c2c50", - "sha256:739d09e13751e3b9b0f341b5ffe5bf8d0452b8769d435c4084ee88739d42b7f7" + "sha256:1eef14ae98e8662e43f7cf6d993c732793def02644e2d489c5171d3b9269e900", + "sha256:950a49c5286fe1f6d72cfbe2910b9ddbdfbb907975ddc41cf38ac9709b4d1291" ], "markers": "python_version >= '3.7'", - "version": "==1.31.27" + "version": "==1.31.31" }, "certifi": { "hashes": [ @@ -57,11 +57,11 @@ }, "click": { "hashes": [ - "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd", - "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5" + 
"sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" ], "index": "pypi", - "version": "==8.1.6" + "version": "==8.1.7" }, "jmespath": { "hashes": [ @@ -238,6 +238,28 @@ } }, "develop": { + "appnope": { + "hashes": [ + "sha256:02bd91c4de869fbb1e1c50aafc4098827a7a54ab2f39d9dcba6c9547ed920e24", + "sha256:265a455292d0bd8a72453494fa24df5a11eb18373a60c7c0430889f22548605e" + ], + "markers": "sys_platform == 'darwin'", + "version": "==0.1.3" + }, + "asttokens": { + "hashes": [ + "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3", + "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c" + ], + "version": "==2.2.1" + }, + "backcall": { + "hashes": [ + "sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e", + "sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255" + ], + "version": "==0.2.0" + }, "bandit": { "hashes": [ "sha256:75665181dc1e0096369112541a056c59d1c5f66f9bb74a8d686c3c362b83f549", @@ -365,11 +387,11 @@ }, "click": { "hashes": [ - "sha256:48ee849951919527a045bfe3bf7baa8a959c423134e1a5b98c05c20ba75a1cbd", - "sha256:fa244bb30b3b5ee2cae3da8f55c9e5e0c0e86093306301fb418eb9dc40fbded5" + "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28", + "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de" ], "index": "pypi", - "version": "==8.1.6" + "version": "==8.1.7" }, "coverage": { "hashes": [ @@ -435,12 +457,27 @@ "index": "pypi", "version": "==3.3.1" }, + "decorator": { + "hashes": [ + "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", + "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186" + ], + "markers": "python_version >= '3.5'", + "version": "==5.1.1" + }, "docopt": { "hashes": [ "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491" ], "version": "==0.6.2" }, + "executing": { + "hashes": [ + 
"sha256:0314a69e37426e3608aada02473b4161d4caf5a4b244d1d0c48072b8fee7bacc", + "sha256:19da64c18d2d851112f09c287f8d3dbbdf725ab0e569077efb6cdcbd3497c107" + ], + "version": "==1.2.0" + }, "flake8": { "hashes": [ "sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23", @@ -481,6 +518,14 @@ "markers": "python_version >= '3.7'", "version": "==2.0.0" }, + "ipython": { + "hashes": [ + "sha256:1d197b907b6ba441b692c48cf2a3a2de280dc0ac91a3405b39349a50272ca0a1", + "sha256:248aca623f5c99a6635bc3857677b7320b9b8039f99f070ee0d20a5ca5a8e6bf" + ], + "index": "pypi", + "version": "==8.14.0" + }, "isort": { "hashes": [ "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504", @@ -489,6 +534,14 @@ "index": "pypi", "version": "==5.12.0" }, + "jedi": { + "hashes": [ + "sha256:bcf9894f1753969cbac8022a8c2eaee06bfa3724e4192470aaffe7eb6272b0c4", + "sha256:cb8ce23fbccff0025e9386b5cf85e892f94c9b822378f8da49970471335ac64e" + ], + "markers": "python_version >= '3.6'", + "version": "==0.19.0" + }, "markdown-it-py": { "hashes": [ "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", @@ -497,6 +550,14 @@ "markers": "python_version >= '3.8'", "version": "==3.0.0" }, + "matplotlib-inline": { + "hashes": [ + "sha256:f1f41aab5328aa5aaea9b16d083b128102f8712542f819fe7e6a420ff581b311", + "sha256:f887e5f10ba98e8d2b150ddcf4702c1e5f8b3a20005eb0f74bfdbd360ee6f304" + ], + "markers": "python_version >= '3.5'", + "version": "==0.1.6" + }, "mccabe": { "hashes": [ "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325", @@ -515,31 +576,36 @@ }, "mypy": { "hashes": [ - "sha256:1fe816e26e676c1311b9e04fd576543b873576d39439f7c24c8e5c7728391ecf", - "sha256:2c9d570f53908cbea326ad8f96028a673b814d9dca7515bf71d95fa662c3eb6f", - "sha256:35b13335c6c46a386577a51f3d38b2b5d14aa619e9633bb756bd77205e4bd09f", - "sha256:372fd97293ed0076d52695849f59acbbb8461c4ab447858cdaeaf734a396d823", - 
"sha256:42170e68adb1603ccdc55a30068f72bcfcde2ce650188e4c1b2a93018b826735", - "sha256:69b32d0dedd211b80f1b7435644e1ef83033a2af2ac65adcdc87c38db68a86be", - "sha256:725b57a19b7408ef66a0fd9db59b5d3e528922250fb56e50bded27fea9ff28f0", - "sha256:769ddb6bfe55c2bd9c7d6d7020885a5ea14289619db7ee650e06b1ef0852c6f4", - "sha256:79c520aa24f21852206b5ff2cf746dc13020113aa73fa55af504635a96e62718", - "sha256:84cf9f7d8a8a22bb6a36444480f4cbf089c917a4179fbf7eea003ea931944a7f", - "sha256:9166186c498170e1ff478a7f540846b2169243feb95bc228d39a67a1a450cdc6", - "sha256:a2500ad063413bc873ae102cf655bf49889e0763b260a3a7cf544a0cbbf7e70a", - "sha256:a551ed0fc02455fe2c1fb0145160df8336b90ab80224739627b15ebe2b45e9dc", - "sha256:ad3109bec37cc33654de8db30fe8ff3a1bb57ea65144167d68185e6dced9868d", - "sha256:b4ea3a0241cb005b0ccdbd318fb99619b21ae51bcf1660b95fc22e0e7d3ba4a1", - "sha256:c36011320e452eb30bec38b9fd3ba20569dc9545d7d4540d967f3ea1fab9c374", - "sha256:c8a7444d6fcac7e2585b10abb91ad900a576da7af8f5cffffbff6065d9115813", - "sha256:cbf18f8db7e5f060d61c91e334d3b96d6bb624ddc9ee8a1cde407b737acbca2c", - "sha256:d145b81a8214687cfc1f85c03663a5bbe736777410e5580e54d526e7e904f564", - "sha256:eec5c927aa4b3e8b4781840f1550079969926d0a22ce38075f6cfcf4b13e3eb4", - "sha256:f3460f34b3839b9bc84ee3ed65076eb827cd99ed13ed08d723f9083cada4a212", - "sha256:f3940cf5845b2512b3ab95463198b0cdf87975dfd17fdcc6ce9709a9abe09e69" + "sha256:159aa9acb16086b79bbb0016145034a1a05360626046a929f84579ce1666b315", + "sha256:258b22210a4a258ccd077426c7a181d789d1121aca6db73a83f79372f5569ae0", + "sha256:26f71b535dfc158a71264e6dc805a9f8d2e60b67215ca0bfa26e2e1aa4d4d373", + "sha256:26fb32e4d4afa205b24bf645eddfbb36a1e17e995c5c99d6d00edb24b693406a", + "sha256:2fc3a600f749b1008cc75e02b6fb3d4db8dbcca2d733030fe7a3b3502902f161", + "sha256:32cb59609b0534f0bd67faebb6e022fe534bdb0e2ecab4290d683d248be1b275", + "sha256:330857f9507c24de5c5724235e66858f8364a0693894342485e543f5b07c8693", + "sha256:361da43c4f5a96173220eb53340ace68cda81845cd88218f8862dfb0adc8cddb", 
+ "sha256:4a465ea2ca12804d5b34bb056be3a29dc47aea5973b892d0417c6a10a40b2d65", + "sha256:51cb1323064b1099e177098cb939eab2da42fea5d818d40113957ec954fc85f4", + "sha256:57b10c56016adce71fba6bc6e9fd45d8083f74361f629390c556738565af8eeb", + "sha256:596fae69f2bfcb7305808c75c00f81fe2829b6236eadda536f00610ac5ec2243", + "sha256:5d627124700b92b6bbaa99f27cbe615c8ea7b3402960f6372ea7d65faf376c14", + "sha256:6ac9c21bfe7bc9f7f1b6fae441746e6a106e48fc9de530dea29e8cd37a2c0cc4", + "sha256:82cb6193de9bbb3844bab4c7cf80e6227d5225cc7625b068a06d005d861ad5f1", + "sha256:8f772942d372c8cbac575be99f9cc9d9fb3bd95c8bc2de6c01411e2c84ebca8a", + "sha256:9fece120dbb041771a63eb95e4896791386fe287fefb2837258925b8326d6160", + "sha256:a156e6390944c265eb56afa67c74c0636f10283429171018446b732f1a05af25", + "sha256:a9ec1f695f0c25986e6f7f8778e5ce61659063268836a38c951200c57479cc12", + "sha256:abed92d9c8f08643c7d831300b739562b0a6c9fcb028d211134fc9ab20ccad5d", + "sha256:b031b9601f1060bf1281feab89697324726ba0c0bae9d7cd7ab4b690940f0b92", + "sha256:c543214ffdd422623e9fedd0869166c2f16affe4ba37463975043ef7d2ea8770", + "sha256:d28ddc3e3dfeab553e743e532fb95b4e6afad51d4706dd22f28e1e5e664828d2", + "sha256:f33592ddf9655a4894aef22d134de7393e95fcbdc2d15c1ab65828eee5c66c70", + "sha256:f6b0e77db9ff4fda74de7df13f30016a0a663928d669c9f2c057048ba44f09bb", + "sha256:f757063a83970d67c444f6e01d9550a7402322af3557ce7630d3c957386fa8f5", + "sha256:ff0cedc84184115202475bbb46dd99f8dcb87fe24d5d0ddfc0fe6b8575c88d2f" ], "index": "pypi", - "version": "==1.5.0" + "version": "==1.5.1" }, "mypy-extensions": { "hashes": [ @@ -557,6 +623,14 @@ "markers": "python_version >= '3.7'", "version": "==23.1" }, + "parso": { + "hashes": [ + "sha256:8c07be290bb59f03588915921e29e8a50002acaf2cdc5fa0e0114f91709fafa0", + "sha256:c001d4636cd3aecdaf33cbb40aebb59b094be2a74c556778ef5576c175e19e75" + ], + "markers": "python_version >= '3.6'", + "version": "==0.8.3" + }, "pathspec": { "hashes": [ "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20", 
@@ -573,6 +647,21 @@ "markers": "python_version >= '2.6'", "version": "==5.11.1" }, + "pexpect": { + "hashes": [ + "sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937", + "sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c" + ], + "markers": "sys_platform != 'win32'", + "version": "==4.8.0" + }, + "pickleshare": { + "hashes": [ + "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", + "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" + ], + "version": "==0.7.5" + }, "platformdirs": { "hashes": [ "sha256:b45696dab2d7cc691a3226759c0d3b00c47c8b6e293d96f6436f733303f77f6d", @@ -589,6 +678,28 @@ "markers": "python_version >= '3.7'", "version": "==1.2.0" }, + "prompt-toolkit": { + "hashes": [ + "sha256:04505ade687dc26dc4284b1ad19a83be2f2afe83e7a828ace0c72f3a1df72aac", + "sha256:9dffbe1d8acf91e3de75f3b544e4842382fc06c6babe903ac9acb74dc6e08d88" + ], + "markers": "python_full_version >= '3.7.0'", + "version": "==3.0.39" + }, + "ptyprocess": { + "hashes": [ + "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", + "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220" + ], + "version": "==0.7.0" + }, + "pure-eval": { + "hashes": [ + "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350", + "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3" + ], + "version": "==0.2.2" + }, "pycodestyle": { "hashes": [ "sha256:259bcc17857d8a8b3b4a2327324b79e5f020a13c16074670f9c8c8f872ea76d0", @@ -683,6 +794,14 @@ "markers": "python_full_version >= '3.7.0'", "version": "==13.5.2" }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, "smmap": { "hashes": [ 
"sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94", @@ -691,6 +810,13 @@ "markers": "python_version >= '3.6'", "version": "==5.0.0" }, + "stack-data": { + "hashes": [ + "sha256:32d2dd0376772d01b6cb9fc996f3c8b57a357089dec328ed4b6553d037eaf815", + "sha256:cbb2a53eb64e5785878201a97ed7c7b94883f48b87bfb0bbe8b623c74679e4a8" + ], + "version": "==0.6.2" + }, "stevedore": { "hashes": [ "sha256:8cc040628f3cea5d7128f2e76cf486b2251a4e543c7b938f58d9a377f6694a2d", @@ -699,6 +825,14 @@ "markers": "python_version >= '3.8'", "version": "==5.1.0" }, + "traitlets": { + "hashes": [ + "sha256:9e6ec080259b9a5940c797d58b613b5e31441c2257b87c2e795c5228ae80d2d8", + "sha256:f6cde21a9c68cf756af02035f72d5a723bf607e862e7be33ece505abf4a3bad9" + ], + "markers": "python_version >= '3.7'", + "version": "==5.9.0" + }, "typing-extensions": { "hashes": [ "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36", @@ -714,6 +848,13 @@ ], "markers": "python_version >= '3.6'", "version": "==1.26.16" + }, + "wcwidth": { + "hashes": [ + "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e", + "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0" + ], + "version": "==0.2.6" } } } From 0ba0781c96096b25747dbd854321341cfab2076f Mon Sep 17 00:00:00 2001 From: Graham Hukill Date: Tue, 22 Aug 2023 13:53:45 -0400 Subject: [PATCH 2/2] Use @normal attribute for ASpace EAD dates Why these changes are being introduced: We received a Sentry error that a record had an invalid date range. As described in the linked ticket, it became apparent this was related to a combination of date parsing and validation, and how it varies between Transmogrifier and OpenSearch environments. It has been noted that the EAD unitdate field @normal attribute might contain a more normalized form of the date that could help with these kinds of errors or ambiguity.
How this addresses that need: The date parsing for ASpace EADs has been updated to use the @normal attribute, which provides a cleaner and more predictable string to work with for splitting as a date range. Side effects of this change: There are 60 ASpace EADs that do not have a @normal attribute in their archdesc.unitdate element. For these, it's possible they will lose a date value if they had been parsed previously. But it's believed this will not be the case, given their element text values were all also somewhat unusual. Relevant ticket(s): * https://mitlibraries.atlassian.net/browse/TIMX-235 * https://mitlibraries.atlassian.net/browse/TIMX-92 --- tests/fixtures/ead/ead_record_all_fields.xml | 2 +- ...cord_attribute_and_subfield_variations.xml | 40 ++------ tests/test_ead.py | 52 ++++------- transmogrifier/sources/ead.py | 92 ++++++++++++------- 4 files changed, 87 insertions(+), 99 deletions(-) diff --git a/tests/fixtures/ead/ead_record_all_fields.xml b/tests/fixtures/ead/ead_record_all_fields.xml index d742ea9..7bbed6a 100644 --- a/tests/fixtures/ead/ead_record_all_fields.xml +++ b/tests/fixtures/ead/ead_record_all_fields.xml @@ -32,7 +32,7 @@ VC.0002 Title 3 - 1905-2012 + 1905-2012 1234 diff --git a/tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml b/tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml index c2a9281..d3f959b 100644 --- a/tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml +++ b/tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml @@ -26,36 +26,16 @@ - - 1905-2012 - 1905 - - 1905-2012 - 1905 - - 1905-2012 - 1905 - - 1905-2012 - 1905 - abcd-efgh - abcd - - 1905-2012 - 1905 - - 1905-2012 - 1905 - abcd-efgh - abcd - - 1905-2012 - 1905 - abcd-efgh - abcd - - 1905-2012 - 1905 + 1905-2012 + 1905-2012 + 1905-2012 + 1905-2012 + 1953 November 9–10 + 1969-03-04 + 2023 + 1984-1989 + undated + 2001-1999 Data enclosed in subelement diff --git a/tests/test_ead.py b/tests/test_ead.py index 
9dee37e..4855a58 100644 --- a/tests/test_ead.py +++ b/tests/test_ead.py @@ -385,41 +385,24 @@ def test_ead_record_with_attribute_and_subfield_variations_transforms_correctly( timdex.Date( range=timdex.Date_Range(gte="1905", lte="2012"), ), - timdex.Date(value="1905"), - timdex.Date( - range=timdex.Date_Range(gte="1905", lte="2012"), - ), - timdex.Date(value="1905"), - timdex.Date( - range=timdex.Date_Range(gte="1905", lte="2012"), - ), - timdex.Date(value="1905"), - timdex.Date( - kind="creation", - range=timdex.Date_Range(gte="1905", lte="2012"), - ), - timdex.Date(kind="creation", value="1905"), timdex.Date( kind="creation", range=timdex.Date_Range(gte="1905", lte="2012"), ), - timdex.Date(kind="creation", value="1905"), timdex.Date( - kind="creation", note="approximate", range=timdex.Date_Range(gte="1905", lte="2012"), ), - timdex.Date(kind="creation", note="approximate", value="1905"), timdex.Date( + kind="creation", note="approximate", range=timdex.Date_Range(gte="1905", lte="2012"), ), - timdex.Date(note="approximate", value="1905"), timdex.Date( - note="approximate", - range=timdex.Date_Range(gte="1905", lte="2012"), + range=timdex.Date_Range(gte="1953-11-09", lte="1953-11-10"), ), - timdex.Date(note="approximate", value="1905"), + timdex.Date(value="1969-03-04"), + timdex.Date(value="2023"), ], identifiers=[ timdex.Identifier( @@ -509,21 +492,18 @@ def test_ead_record_invalid_date_and_date_range_are_omitted(caplog): "tests/fixtures/ead/ead_record_attribute_and_subfield_variations.xml" ) output_record = next(Ead("aspace", ead_xml_records)) - assert "abcd" not in [d.value for d in output_record.dates] - assert "abcd" not in [ - d.range.gte for d in output_record.dates if "gte" in dir(d.range) - ] - assert "efgh" not in [ - d.range.lte for d in output_record.dates if "lte" in dir(d.range) - ] - assert ( - "Record ID 'repositories/2/resources/6' has invalid values in a date range: " - "'abcd', 'efgh'" - ) in caplog.text - assert ( - "Record ID 
'repositories/2/resources/6' has a date that couldn't be parsed: " - "'abcd'" - ) in caplog.text + + for date in output_record.dates: + assert date.value != "undated" + assert date.value != "1984" + if date.range is not None: + assert date.range.gte != "1984" + assert date.range.lte != "1989" + assert date.range.gte != "2001" + assert date.range.lte != "1999" + + assert ("has a date that couldn't be parsed: 'undated'") in caplog.text + assert ("has a later start date than end date: '2001', '1999'") in caplog.text def test_ead_record_correct_identifiers_from_multiple_unitid(caplog): diff --git a/transmogrifier/sources/ead.py b/transmogrifier/sources/ead.py index 51670d3..d82b1dc 100644 --- a/transmogrifier/sources/ead.py +++ b/transmogrifier/sources/ead.py @@ -103,39 +103,11 @@ def get_optional_fields(self, xml: Tag) -> Optional[dict]: identifier=self.generate_name_identifier_url(name_element), ) ) + # dates - for date_element in collection_description_did.find_all("unitdate"): - if date_value := self.create_string_from_mixed_value( - date_element, - " ", - ): - date_instance = timdex.Date() - if "-" in date_value: - split = date_value.index("-") - gte_date = date_value[:split].strip() - lte_date = date_value[split + 1 :].strip() - if validate_date_range( - gte_date, - lte_date, - source_record_id, - ): - date_instance.range = timdex.Date_Range( - gte=gte_date, - lte=lte_date, - ) - else: - date_instance.value = ( - date_value.strip() - if validate_date( - date_value, - source_record_id, - ) - else None - ) - if date_instance.range or date_instance.value: - date_instance.kind = date_element.get("datechar") or None - date_instance.note = date_element.get("certainty") or None - fields.setdefault("dates", []).append(date_instance) + dates = self.parse_dates(collection_description_did, source_record_id) + if dates: + fields.setdefault("dates", []).extend(dates) # edition field not used in EAD @@ -452,3 +424,59 @@ def parse_mixed_value( elif isinstance(item, Tag) and 
item.name not in skipped_elements: for child in item.children: yield from cls.parse_mixed_value(child, skipped_elements) + + def parse_dates( + self, collection_description_did: Tag, source_record_id: str + ) -> list[timdex.Date]: + """ + Dedicated method to parse dates. Targeting archdesc.unitdate elements, using + only those with a @normal attribute value. These are almost uniformly ranges, + but in the event they are not (or two identical values for the range) a single + date value is produced. + """ + + dates = [] + for date_element in collection_description_did.find_all("unitdate"): + normal_date = date_element.get("normal", "").strip() + if normal_date == "": + continue + + date_instance = timdex.Date() + + # date range + if "/" in normal_date: + gte_date, lte_date = normal_date.split("/") + if gte_date != lte_date: + if validate_date_range( + gte_date, + lte_date, + source_record_id, + ): + date_instance.range = timdex.Date_Range( + gte=gte_date, + lte=lte_date, + ) + else: + date_str = gte_date # arbitrarily take one + if validate_date( + date_str, + source_record_id, + ): + date_instance.value = date_str + + # fallback on single date + else: + if validate_date( + normal_date, + source_record_id, + ): + date_instance.value = normal_date + + # include @datechar and @certainty attributes + date_instance.kind = date_element.get("datechar") + date_instance.note = date_element.get("certainty") + + if date_instance.range or date_instance.value: + dates.append(date_instance) + + return dates