CentreForDigitalHumanities · lukavdplas · Sep 11, 2023 · Aug 2, 2023 · Aug 2, 2023 · Aug 2, 2023
diff --git a/backend/README.md b/backend/README.md
@@ -65,6 +65,14 @@ If you are overriding the default settings, you may pass `--pythonpath` and `--s
 
 ### Running the application (development server)
 
+If you made any changes to your configured corpora, load them into the database before running the application.
+
+```console
+$ python manage.py loadcorpora
+```
+
+Start the development server with:
+
 ```console
 $ python manage.py runserver
 ```

diff --git a/backend/addcorpus/admin.py b/backend/addcorpus/admin.py
@@ -1,7 +1,140 @@
-from django.contrib import admin
-from .models import Corpus
+from django.contrib import admin, messages
+from .models import Corpus, CorpusConfiguration, Field
+
+def show_warning_message(request):
+    '''
+    Attach a warning to the request, to be displayed when a form is loaded
+    for a resource that is based on a python class.
+    '''
+    messages.warning(
+        request,
+        'Corpus configurations are based on python classes; any changes here will be reset on server startup'
+    )
+
 
 class CorpusAdmin(admin.ModelAdmin):
-    readonly_fields = ['name', 'description']
+    readonly_fields = ['name', 'configuration']
+    fields = ['name', 'groups', 'configuration']
+
+class InlineFieldAdmin(admin.StackedInline):
+    model = Field
+    fields = ['display_name', 'description']
+    show_change_link = True
+    extra = 0
+
+class CorpusConfigurationAdmin(admin.ModelAdmin):
+    readonly_fields = ['corpus']
+
+    inlines = [
+        InlineFieldAdmin
+    ]
+
+    fieldsets = [
+        (
+            None,
+            {
+                'fields': [
+                    'corpus',
+                    'title',
+                    'description',
+                    'description_page',
+                    'image',
+                ]
+            }
+        ), (
+            'Content',
+            {
+                'fields': [
+                    'category',
+                    'languages',
+                    'min_date',
+                    'max_date',
+                    'document_context',
+                ]
+            }
+        ), (
+            'Elasticsearch',
+            {
+                'fields': [
+                    'es_index',
+                    'es_alias',
+                ]
+            }
+        ), (
+            'Scans',
+            {
+                'fields': [
+                    'scan_image_type',
+                    'allow_image_download',
+                ]
+            }
+        ), (
+            'Word models',
+            {
+                'fields': ['word_models_present']
+            }
+        )
+    ]
+
+    def get_form(self, request, obj=None, **kwargs):
+        show_warning_message(request)
+        return super().get_form(request, obj, **kwargs)
+
+
+class FieldAdmin(admin.ModelAdmin):
+    readonly_fields = ['corpus_configuration']
+
+    fieldsets = [
+        (
+            None,
+            {
+                'fields': [
+                    'name',
+                    'corpus_configuration',
+                    'display_name',
+                    'description',
+                    'hidden',
+                    'downloadable',
+                ]
+            }
+        ),
+        (
+            'Indexing options',
+            {
+                'fields': [
+                    'es_mapping',
+                    'indexed',
+                    'required',
+                ]
+            }
+        ), (
+            'Search interface',
+            {
+                'fields': [
+                    'search_filter',
+                    'results_overview',
+                    'searchable',
+                    'search_field_core',
+                    'sortable',
+                    'primary_sort',
+                ]
+            }
+        ), (
+            'Visualisations',
+            {
+                'fields': [
+                    'visualizations',
+                    'visualization_sort',
+                ]
+            }
+        )
+    ]
+
+    def get_form(self, request, obj=None, **kwargs):
+        show_warning_message(request)
+        return super().get_form(request, obj, **kwargs)
+
 
 admin.site.register(Corpus, CorpusAdmin)
+admin.site.register(CorpusConfiguration, CorpusConfigurationAdmin)
+admin.site.register(Field, FieldAdmin)
diff --git a/backend/addcorpus/corpus.py b/backend/addcorpus/corpus.py
@@ -3,10 +3,7 @@
 '''
 
 from . import extract
-from zipfile import ZipExtFile
 import itertools
-import inspect
-import json
 import bs4
 import csv
 import sys
@@ -19,8 +16,9 @@
 from addcorpus.constants import CATEGORIES
 
 import logging
-logger = logging.getLogger('indexing')
+from ianalyzer.settings import NEW_HIGHLIGHT_CORPORA
 
+logger = logging.getLogger('indexing')
 
 
 class CorpusDefinition(object):
@@ -235,72 +233,6 @@ def es_mapping(self):
             }
         }
 
-    def json(self):
-        '''
-        Corpora should be able to produce JSON, so that the fields they define
-        can be used by other codebases, while retaining the Python class as the
-        single source of truth.
-        '''
-        corpus_dict = self.serialize()
-        json_dict = json.dumps(corpus_dict)
-        return json_dict
-
-    def serialize(self):
-        """
-        Convert corpus object to a JSON-friendly dict format.
-        """
-        corpus_dict = {}
-
-        # gather attribute names
-        # exclude:
-        # - methods not implemented in Corpus class
-        # - hidden attributes
-        # - attributes listed in `exclude`
-        # - bound methods
-        exclude = ['data_directory', 'es_settings', 'word_model_path']
-        corpus_attribute_names = [
-            a for a in dir(self)
-            if a in dir(CorpusDefinition) and not a.startswith('_') and a not in exclude and not inspect.ismethod(self.__getattribute__(a))
-        ]
-
-        # collect values
-        corpus_attributes = [(a, getattr(self, a)) for a in corpus_attribute_names ]
-
-        for ca in corpus_attributes:
-            if ca[0] == 'fields':
-                field_list = []
-                for field in self.fields:
-                    field_list.append(field.serialize())
-                corpus_dict[ca[0]] = field_list
-            elif ca[0] == 'languages':
-                format = lambda tag: Language.make(standardize_tag(tag)).display_name() if tag else 'Unknown'
-                corpus_dict[ca[0]] = [
-                    format(tag)
-                    for tag in ca[1]
-                ]
-            elif ca[0] == 'category':
-                corpus_dict[ca[0]] =  self._format_option(ca[1], CATEGORIES)
-            elif type(ca[1]) == datetime:
-                timedict = {'year': ca[1].year,
-                            'month': ca[1].month,
-                            'day': ca[1].day,
-                            'hour': ca[1].hour,
-                            'minute': ca[1].minute}
-                corpus_dict[ca[0]] = timedict
-            else:
-                corpus_dict[ca[0]] = ca[1]
-        return corpus_dict
-
-    def _format_option(self, value, options):
-        '''
-        For serialisation: format language or category based on list of options
-        '''
-        return next(
-            nice_string
-            for code, nice_string in options
-            if value == code
-        )
-
     def sources(self, start=datetime.min, end=datetime.max):
         '''
         Obtain source files for the corpus, relevant to the given timespan.
@@ -753,13 +685,13 @@ def __init__(self,
                  name=None,
                  display_name=None,
                  display_type=None,
-                 description=None,
+                 description='',
                  indexed=True,
                  hidden=False,
                  results_overview=False,
                  csv_core=False,
                  search_field_core=False,
-                 visualizations=None,
+                 visualizations=[],
                  visualization_sort=None,
                  es_mapping={'type': 'text'},
                  search_filter=None,
@@ -772,9 +704,11 @@ def __init__(self,
                  **kwargs
                  ):
 
+        mapping_type = es_mapping['type']
+
         self.name = name
-        self.display_name = display_name
-        self.display_type = display_type
+        self.display_name = display_name or name
+        self.display_type = display_type or mapping_type
         self.description = description
         self.search_filter = search_filter
         self.results_overview = results_overview
@@ -790,41 +724,22 @@ def __init__(self,
 
         self.sortable = sortable if sortable != None else \
             not hidden and indexed and \
-            es_mapping['type'] in ['integer', 'float', 'date']
+            mapping_type in ['integer', 'float', 'date']
 
         self.primary_sort = primary_sort
 
         # Fields are searchable if they are not hidden and if they are mapped as 'text'.
         # Keyword fields without a filter are also searchable.
         self.searchable = searchable if searchable != None else \
             not hidden and indexed and \
-            ((self.es_mapping['type'] == 'text') or
-             (self.es_mapping['type'] == 'keyword' and self.search_filter == None))
+            ((mapping_type == 'text') or
+             (mapping_type == 'keyword' and self.search_filter == None))
         # Add back reference to field in filter
         self.downloadable = downloadable
 
         if self.search_filter:
             self.search_filter.field = self
 
-    def serialize(self):
-        """
-        Convert Field object to a JSON-friendly dict format.
-        """
-        field_dict = {}
-        for key, value in self.__dict__.items():
-            if key == 'search_filter' and value != None:
-                filter_name = str(type(value)).split(
-                    sep='.')[-1][:-2]
-                search_dict = {'name': filter_name}
-                for search_key, search_value in value.__dict__.items():
-                    if search_key == 'search_filter' or search_key != 'field':
-                        search_dict[search_key] = search_value
-                field_dict['search_filter'] = search_dict
-            elif key != 'extractor':
-                field_dict[key] = value
-
-        return field_dict
-
 
 # Helper functions ############################################################
 

diff --git a/backend/addcorpus/filters.py b/backend/addcorpus/filters.py
@@ -10,21 +10,27 @@ class Filter(object):
     A filter is the interface between the form that is presented to users and
     the ElasticSearch filter that is sent to the client.
     '''
-    # TODO Far as I can tell, this is a specific implementation of a problem
-    # with which WTForms deals in general. Therefore, this should be embedded
-    # in WTForms.
-
-
+
     def __init__(self, description=None):
         self.field = None # Must be filled after initialising
         self.description = description
-
+
+    def serialize(self):
+        '''Return a dict with the filter's class name and its attributes.'''
+        search_dict = {'name': type(self).__name__}
+        for key, value in self.__dict__.items():
+            if key == 'field':
+                continue  # back reference to the owning field; left out
+            if type(value) == datetime:
+                value = value.isoformat()  # store datetimes as ISO strings
+            search_dict[key] = value
+        return search_dict
 
 class DateFilter(Filter):
     '''
     Filter for datetime values: produces two datepickers for min and max date.
     '''
-    
+
     def __init__(self, lower, upper, *nargs, **kwargs):
         self.lower = lower
         self.upper = upper
@@ -35,7 +41,7 @@ class RangeFilter(Filter):
     '''
     Filter for numerical values: produces a slider between two values.
     '''
-    
+
     def __init__(self, lower, upper, *nargs, **kwargs):
         self.lower = lower
         self.upper = upper
@@ -46,7 +52,7 @@ class MultipleChoiceFilter(Filter):
     '''
     Filter for keyword values: produces a set of buttons.
     '''
-    
+
     def __init__(self, option_count=10, *nargs, **kwargs):
         self.option_count = option_count
         # option_count defines how many buckets are retrieved
@@ -58,7 +64,7 @@ class BooleanFilter(Filter):
     '''
     Filter for boolean values: produces a drop-down menu.
     ''' #TODO checkbox?
-    
+
     def __init__(self, true, false, *nargs, **kwargs):
         self.true = true
         self.false = false