Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/corpus form #1659

Merged
merged 55 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
e9f91bf
Allow corpus create with only title
JeltevanBoheemen Jul 31, 2024
13d10f1
Scaffold definition form in frontend
JeltevanBoheemen Aug 6, 2024
b5ef920
CorpusDataFile model and views
JeltevanBoheemen Aug 6, 2024
f57e3f6
Backend view for csv info
JeltevanBoheemen Aug 7, 2024
12ec2f5
Adapt backend field types to frontend model
JeltevanBoheemen Aug 13, 2024
0555989
Merge branch 'develop' into feature/corpus-form
JeltevanBoheemen Aug 13, 2024
e3cd17d
Fix corpus form tests
JeltevanBoheemen Aug 13, 2024
538203f
Show datafiles in corpus admin
JeltevanBoheemen Aug 13, 2024
14c627d
Upload sample data frontend
JeltevanBoheemen Aug 14, 2024
fe587e2
Set csv delimiter and read csv with it
JeltevanBoheemen Aug 14, 2024
f5f2f3c
Rename ApiCorpusField in CorpusDefinition
JeltevanBoheemen Aug 14, 2024
f513131
Polish corpus form
JeltevanBoheemen Aug 15, 2024
578ed39
Make fields from sample data
JeltevanBoheemen Aug 15, 2024
4a38f87
Single field form
JeltevanBoheemen Aug 21, 2024
6edf6c1
First version of fields form
JeltevanBoheemen Aug 29, 2024
90d7ab9
Dropdown for language selection
JeltevanBoheemen Sep 13, 2024
b62591d
Dropdown for corpus categories
JeltevanBoheemen Sep 13, 2024
c185280
Populate sample data info from database
JeltevanBoheemen Sep 13, 2024
1192591
Navigate to form after creation, else overview
JeltevanBoheemen Sep 13, 2024
ac27a66
Activate steps when fields already present
JeltevanBoheemen Sep 13, 2024
d2479b5
Provide some help text for fields
JeltevanBoheemen Sep 13, 2024
2741786
Merge branch 'develop' into feature/corpus-form
JeltevanBoheemen Sep 13, 2024
3a09ad9
Remove debug routes
JeltevanBoheemen Sep 13, 2024
a41a4b0
Use date input fields for metadata
JeltevanBoheemen Sep 19, 2024
f6bab2e
Rename 'submit corpus' to 'save changes'
JeltevanBoheemen Sep 19, 2024
12fcade
Use empty array for languages instead of null
JeltevanBoheemen Sep 19, 2024
04907a4
Include field name and extract in form
JeltevanBoheemen Oct 1, 2024
19296e9
Remove the 'hidden' option in field form
JeltevanBoheemen Oct 1, 2024
3ae11d2
Show help texts for field types
JeltevanBoheemen Oct 1, 2024
bdf4ce6
Add boolean field type
JeltevanBoheemen Oct 1, 2024
a10906e
Conditionally show language picker
JeltevanBoheemen Oct 1, 2024
b8e190b
Rename edit to InOut, form to edit
JeltevanBoheemen Oct 3, 2024
ecb814d
Fix frontend test imports
JeltevanBoheemen Oct 3, 2024
68d4867
fix tests
lukavdplas Nov 6, 2024
3ce0b7e
Resolve minor form issues
JeltevanBoheemen Dec 10, 2024
14a5b4a
Merge branch 'develop' into feature/corpus-form
JeltevanBoheemen Dec 10, 2024
2787872
Rename migration
JeltevanBoheemen Dec 10, 2024
6c6f28a
Make is_date_col work with missing values
JeltevanBoheemen Dec 10, 2024
bb3cac5
Serialize datafile name instead of path
JeltevanBoheemen Dec 10, 2024
168ad63
Rename toggleStep to toggleStepDisabled
JeltevanBoheemen Dec 10, 2024
3edef17
add delimiter type
JeltevanBoheemen Dec 10, 2024
172a98f
use cloneDeep for nested objects
JeltevanBoheemen Dec 10, 2024
1023af9
Reword create from scratch
JeltevanBoheemen Dec 10, 2024
5880510
Clean up CorpusFormComponent template
JeltevanBoheemen Dec 10, 2024
0c3fe08
hide filter option for text content
JeltevanBoheemen Dec 10, 2024
8539528
Show/hide field form options for text fields
JeltevanBoheemen Dec 10, 2024
42b4527
Use paragraph elements for subtitles
JeltevanBoheemen Dec 10, 2024
d3bd57a
Use ISO 639-3 for language picker
JeltevanBoheemen Dec 10, 2024
05342e7
Language picker for meta form
JeltevanBoheemen Dec 10, 2024
34ad8b1
Remove dangercolor from reset sample button
JeltevanBoheemen Dec 10, 2024
c0a22c4
Use a table for sample component
JeltevanBoheemen Dec 10, 2024
57eadd1
Show column name in field form
JeltevanBoheemen Dec 11, 2024
ac41163
Extra help texts for corpus form
JeltevanBoheemen Dec 11, 2024
474ffc5
fix dropdown item height
lukavdplas Jan 15, 2025
95c8320
Merge branch 'develop' into feature/corpus-form
lukavdplas Jan 15, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion backend/addcorpus/admin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib import admin, messages
from .models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
from .models import Corpus, CorpusConfiguration, CorpusDataFile, Field, CorpusDocumentationPage

def show_warning_message(request):
'''
Expand All @@ -14,18 +14,26 @@ def show_warning_message(request):
)


class InlineDatafileAdmin(admin.StackedInline):
    '''
    Inline admin for `CorpusDataFile` objects.

    Exposes the uploaded file and its `is_sample` flag of each data file on
    the admin page of the parent object.
    '''
    model = CorpusDataFile
    fields = ['file', 'is_sample']
    # Must be a plain boolean: a trailing comma here would turn the value into
    # the tuple `(True,)`, which only works by virtue of being truthy.
    show_change_link = True
    # Do not render empty extra forms by default.
    extra = 0

class CorpusAdmin(admin.ModelAdmin):
    '''
    Admin configuration for `Corpus` objects.

    The configuration and the readiness flags are displayed read-only; the
    data files of the corpus are edited inline.
    '''
    list_display = ['name', 'active']
    list_filter = ['groups', 'active']
    fields = [
        'name',
        'groups',
        'configuration',
        'has_python_definition',
        'ready_to_index',
        'ready_to_publish',
        'active',
    ]
    readonly_fields = [
        'configuration',
        'ready_to_index',
        'ready_to_publish',
    ]
    inlines = [InlineDatafileAdmin]

class InlineFieldAdmin(admin.StackedInline):
    '''
    Inline admin for `Field` objects, limited to their display name and
    description, with a link to the full change form of each field.
    '''
    model = Field
    extra = 0
    show_change_link = True
    fields = [
        'display_name',
        'description',
    ]


class CorpusConfigurationAdmin(admin.ModelAdmin):
readonly_fields = ['corpus']

Expand Down
6 changes: 6 additions & 0 deletions backend/addcorpus/json_corpora/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
from datetime import date


# Default column delimiter for CSV source data.
DEFAULT_CSV_DELIMITER = ','
# strptime/strftime pattern for dates in JSON corpus definitions (ISO, e.g. 2024-01-31).
DATE_FORMAT = '%Y-%m-%d'

# Fallback date range for corpus definitions that do not specify one.
DEFAULT_MIN_DATE = date(1800, 1, 1)
# NOTE: evaluated once, when this module is first imported; in a long-running
# process the value stays at the import date rather than following the calendar.
DEFAULT_MAX_DATE = date.today()
15 changes: 8 additions & 7 deletions backend/addcorpus/json_corpora/import_json.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from typing import List, Dict, Iterable, Optional
from datetime import datetime
from datetime import date, datetime


from addcorpus.models import Corpus, CorpusConfiguration, Field
from addcorpus.models import Field
from addcorpus.json_corpora.utils import get_path
from addcorpus import es_mappings
from addcorpus.constants import VisualizationType
from addcorpus.validation.publishing import _any_date_fields
from django.conf import settings
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT, DEFAULT_MAX_DATE, DEFAULT_MIN_DATE

def import_json_corpus(data: Dict) -> Dict:
name = get_path(data, 'name')
Expand All @@ -33,9 +32,9 @@ def _parse_configuration(data: Dict) -> Dict:
'es_index': create_index_name(get_path(data, 'name')),
'languages': get_path(data, 'meta', 'languages'),
'min_date': _parse_date(
get_path(data, 'meta', 'date_range', 'min')),
get_path(data, 'meta', 'date_range', 'min'), DEFAULT_MIN_DATE),
'max_date': _parse_date(
get_path(data, 'meta', 'date_range', 'max')),
get_path(data, 'meta', 'date_range', 'max'), DEFAULT_MAX_DATE),
'default_sort': get_path(
data, 'options', 'default_sort') or {},
'language_field': get_path(
Expand All @@ -48,7 +47,9 @@ def _parse_configuration(data: Dict) -> Dict:
}


def _parse_date(date: str):
def _parse_date(date: Optional[str], fallback: Optional[date]):
if not date:
return fallback
return datetime.strptime(date, DATE_FORMAT).date()


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Generated by Django 4.2.14 on 2024-09-19 09:59

import addcorpus.models
import addcorpus.validation.creation
import django.contrib.postgres.fields
from django.db import migrations, models
import django.db.models.deletion


# Schema migration for the corpus form:
# - makes `category` and `description` of the corpus configuration optional
#   (blank and null allowed) and allows `languages` to be left blank;
# - adds the `CorpusDataFile` model for data files uploaded for a corpus.
class Migration(migrations.Migration):

dependencies = [
('addcorpus', '0023_alter_corpusdocumentationpage_type_alter_field_name'),
]

operations = [
# category may now be left empty
migrations.AlterField(
model_name='corpusconfiguration',
name='category',
field=models.CharField(blank=True, choices=[('parliament', 'Parliamentary debates'), ('periodical', 'Newspapers and other periodicals'), ('finance', 'Financial reports'), ('ruling', 'Court rulings'), ('review', 'Online reviews'), ('inscription', 'Funerary inscriptions'), ('oration', 'Orations'), ('book', 'Books'), ('informative', 'Informative')], help_text='category/medium of documents in this dataset', max_length=64, null=True),
),
# description may now be left empty
migrations.AlterField(
model_name='corpusconfiguration',
name='description',
field=models.CharField(blank=True, help_text='short description of the corpus', max_length=254, null=True),
),
# the list of languages may now be blank
migrations.AlterField(
model_name='corpusconfiguration',
name='languages',
field=django.contrib.postgres.fields.ArrayField(base_field=models.CharField(blank=True, max_length=8, validators=[addcorpus.validation.creation.validate_language_code]), blank=True, help_text='languages used in the content of the corpus (from most to least frequent)', size=None),
),
# new table for uploaded data files; rows are deleted together with their corpus
migrations.CreateModel(
name='CorpusDataFile',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('file', models.FileField(help_text='file containing corpus data', upload_to=addcorpus.models.CorpusDataFile.upload_path)),
('is_sample', models.BooleanField(default=False, help_text='this file reflects only part of the total data, for use in creating the corpus definition')),
('created', models.DateTimeField(auto_now_add=True)),
('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='addcorpus.corpus')),
],
),
]
20 changes: 20 additions & 0 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import warnings

from addcorpus.constants import CATEGORIES, MappingType, VisualizationType
Expand Down Expand Up @@ -165,10 +166,13 @@ class CorpusConfiguration(models.Model):
max_length=64,
choices=CATEGORIES,
help_text='category/medium of documents in this dataset',
blank=True,
null=True
)
description = models.CharField(
max_length=MAX_LENGTH_DESCRIPTION,
blank=True,
null=True,
help_text='short description of the corpus',
)
document_context = models.JSONField(
Expand Down Expand Up @@ -198,6 +202,7 @@ class CorpusConfiguration(models.Model):
blank=True,
),
help_text='languages used in the content of the corpus (from most to least frequent)',
blank=True,
)
min_date = models.DateField(
help_text='earliest date for the data in the corpus',
Expand Down Expand Up @@ -491,3 +496,18 @@ class Meta:
name='unique_documentation_type_for_corpus'
)
]


class CorpusDataFile(models.Model):
    '''
    A data file uploaded for a corpus.

    A file can be marked as a sample, meaning that it holds only part of the
    data and is meant for use while creating the corpus definition.
    '''

    def upload_path(self, filename):
        '''
        Build the upload location for a file: a `corpus_datafiles` directory
        with one subdirectory per corpus, named after the corpus' primary key.

        Used as the `upload_to` callable of the `file` field.
        '''
        corpus_directory = str(self.corpus.pk)
        return os.path.join('corpus_datafiles', corpus_directory, filename)

    corpus = models.ForeignKey(
        to=Corpus,
        on_delete=models.CASCADE,
    )
    file = models.FileField(
        upload_to=upload_path,
        help_text='file containing corpus data',
    )
    is_sample = models.BooleanField(
        default=False,
        help_text='this file reflects only part of the total data, for use in creating the corpus definition',
    )
    created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        # represent the data file by the name of the stored file
        return str(self.file.name)
17 changes: 13 additions & 4 deletions backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from rest_framework import serializers
from typing import Dict

from addcorpus.models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
from addcorpus.models import Corpus, CorpusConfiguration, CorpusDataFile, Field, CorpusDocumentationPage
from addcorpus.constants import CATEGORIES
from langcodes import Language, standardize_tag
from addcorpus.documentation import render_documentation_context
Expand Down Expand Up @@ -196,9 +196,12 @@ def update(self, instance: Corpus, validated_data: Dict):
configuration.save()

for field_data in fields_data:
field, _ = Field.objects.get_or_create(
corpus_configuration=configuration, name=field_data['name']
)
try:
field = Field.objects.get(
corpus_configuration=configuration, name=field_data['name'])
except Field.DoesNotExist:
field = Field(corpus_configuration=configuration,
name=field_data['name'])
for attr in field_data:
setattr(field, attr, field_data[attr])
field.save()
Expand All @@ -210,3 +213,9 @@ def update(self, instance: Corpus, validated_data: Dict):
corpus.save()

return corpus


# Serializer for corpus data files; used by the data file API viewset.
class CorpusDataFileSerializer(serializers.ModelSerializer):
class Meta:
model = CorpusDataFile
# expose every field of the model
fields = '__all__'
JeltevanBoheemen marked this conversation as resolved.
Show resolved Hide resolved
11 changes: 11 additions & 0 deletions backend/addcorpus/tests/files/example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
character,line,date-column,FLOAT COLUMN,int_column,bool column
"HAMLET","Whither wilt thou lead me? Speak, I'll go no further.","1256-10-08","7.03","13","True"
"GHOST","Mark me.","1435-10-07","4.74","89","True"
"HAMLET","I will.","1463-07-16","5.55","29","True"
"GHOST","My hour is almost come,","1634-08-09","3.04","100","True"
"GHOST","When I to sulph'rous and tormenting flames","1982-09-01","6.73","34","True"
"GHOST","Must render up myself.","1756-11-22","-0.58","-12","False"
"HAMLET","Alas, poor ghost!","1200-09-05","9.38","6","False"
"GHOST","Pity me not, but lend thy serious hearing","1633-11-18","8.84","83","False"
"GHOST","To what I shall unfold.","1445-11-09","3.6","97","False"
"HAMLET","Speak, I am bound to hear.","1984-08-12","-1.89","-4","False"
28 changes: 28 additions & 0 deletions backend/addcorpus/tests/test_datafiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os

from rest_framework.status import HTTP_200_OK, HTTP_201_CREATED

here = os.path.dirname(os.path.abspath(__file__))


def test_csv_upload(admin_client, json_mock_corpus):
    '''
    Upload the example CSV as sample data for the mock corpus, then request
    the column info of the uploaded file and check the detected field types.
    '''
    csv_path = os.path.join(here, 'files', 'example.csv')
    expected_types = {
        'character': 'text',
        'line': 'text',
        'date-column': 'date',
        'FLOAT COLUMN': 'float',
        'int_column': 'integer',
        'bool column': 'boolean'
    }

    # Test file upload
    with open(csv_path, 'rb') as csv_file:
        payload = {
            'corpus': json_mock_corpus.pk,
            'is_sample': True,
            'file': csv_file,
        }
        upload_response = admin_client.post('/api/corpus/datafiles/', payload)
    assert upload_response.status_code == HTTP_201_CREATED
    datafile_id = upload_response.data.get('id')

    # Test file info
    info_response = admin_client.get(
        f'/api/corpus/datafiles/{datafile_id}/info/')
    assert info_response.status_code == HTTP_200_OK
    assert info_response.data == expected_types
8 changes: 8 additions & 0 deletions backend/addcorpus/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from addcorpus.utils import is_date


def test_is_date():
    '''
    `is_date` accepts a `YYYY-MM-DD` string and rejects missing values,
    non-string values and dates in another notation.
    '''
    assert is_date('2024-01-01')

    for not_a_date in (None, 5, '01-01-2024'):
        assert not is_date(not_a_date)
41 changes: 41 additions & 0 deletions backend/addcorpus/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import datetime
import os
from typing import Dict, Optional, Union

import numpy as np
import pandas as pd


def get_csv_info(path: Union[str, os.PathLike], **kwargs) -> Dict:
    '''
    Read a CSV file and describe the type of each of its columns.

    Parameters:
        path: location of the CSV file.
        **kwargs: passed on to `pandas.read_csv` (e.g. `sep`).

    Returns:
        A dictionary that maps each column name to the field type that
        `map_col` assigns to that column.
    '''
    frame = pd.read_csv(path, **kwargs)

    column_types = {}
    for name in frame.columns:
        column_types[name] = map_col(frame[name])
    return column_types


def map_col(col: pd.Series) -> str:
    '''
    Map the dtype of a column to a field type.

    Dtypes are checked by family rather than by exact equality, so integer and
    float columns of any width, as well as pandas' nullable and string
    extension dtypes, are recognised too.

    Parameters:
        col: the column to classify.

    Returns:
        One of `'boolean'`, `'integer'`, `'float'`, `'date'` or `'text'`.
        Object/string columns are `'date'` if `is_date_col` accepts them and
        `'text'` otherwise; any other dtype falls back to `'text'`.
    '''
    dtype = col.dtype
    types = pd.api.types

    # Check on the dtype (not the values), so that object columns are never
    # inferred to be boolean or numeric from their contents.
    if types.is_bool_dtype(dtype):
        return 'boolean'
    if types.is_integer_dtype(dtype):
        return 'integer'
    if types.is_float_dtype(dtype):
        return 'float'
    if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
        return 'date' if is_date_col(col) else 'text'
    return 'text'


def is_date_col(col: pd.Series) -> bool:
    '''
    Check whether a column consists of date strings.

    Missing values are ignored, so a date column with some blank cells still
    counts as a date column. A column without any values is not considered a
    date column.

    Parameters:
        col: the column to check.

    Returns:
        `True` if the column has at least one non-missing value and `is_date`
        accepts all of them, `False` otherwise.
    '''
    values = col.dropna()
    if values.empty:
        # `all()` of nothing is vacuously true; do not report "date" without
        # any evidence.
        return False
    # convert numpy's boolean to a builtin bool
    return bool(values.map(is_date).all())
JeltevanBoheemen marked this conversation as resolved.
Show resolved Hide resolved


def is_date(input: object) -> bool:
    '''
    Check whether a value is a date string in `%Y-%m-%d` (year-month-day)
    format.

    Parameters:
        input: the value to check. Values that are not strings (such as `None`
            or numbers) are allowed and are never dates.

    Returns:
        `True` if the value can be parsed with the `%Y-%m-%d` format, `False`
        otherwise.
    '''
    try:
        datetime.datetime.strptime(input, '%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: a string in another format; TypeError: not a string
        return False
    return True
56 changes: 46 additions & 10 deletions backend/addcorpus/views.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,25 @@
from rest_framework.views import APIView
from addcorpus.serializers import CorpusSerializer, CorpusDocumentationPageSerializer, CorpusJSONDefinitionSerializer
from addcorpus.python_corpora.load_corpus import corpus_dir, load_corpus_definition
import os

from addcorpus.models import (Corpus, CorpusConfiguration, CorpusDataFile,
CorpusDocumentationPage)
from addcorpus.permissions import (CanSearchCorpus, IsCurator, IsCuratorOrReadOnly,
corpus_name_from_request)
from addcorpus.python_corpora.load_corpus import (corpus_dir)
from addcorpus.serializers import (CorpusDataFileSerializer,
CorpusDocumentationPageSerializer,
CorpusJSONDefinitionSerializer,
CorpusSerializer)
from addcorpus.utils import get_csv_info
from django.conf import settings
from django.http.response import FileResponse
from addcorpus.permissions import (
CanSearchCorpus, corpus_name_from_request, IsCurator,
IsCuratorOrReadOnly)
from rest_framework.exceptions import NotFound
from rest_framework import viewsets
from addcorpus.models import Corpus, CorpusConfiguration, CorpusDocumentationPage
from rest_framework.decorators import action
from rest_framework.exceptions import NotFound
from rest_framework.permissions import (IsAuthenticated)
from rest_framework.response import Response
from rest_framework.status import HTTP_200_OK
from rest_framework.views import APIView

from django.conf import settings

class CorpusView(viewsets.ReadOnlyModelViewSet):
'''
Expand All @@ -36,7 +45,7 @@ def get_queryset(self):
if self.request.user.is_staff:
corpora = Corpus.objects.all()
else:
corpora = self.request.user.searchable_corpora()
corpora = self.request.user.searchable_corpora()

queried_corpus = self.request.query_params.get('corpus')
if queried_corpus:
Expand Down Expand Up @@ -88,3 +97,30 @@ class CorpusDefinitionViewset(viewsets.ModelViewSet):

def get_queryset(self):
return Corpus.objects.filter(has_python_definition=False)


class CorpusDataFileViewSet(viewsets.ModelViewSet):
    '''
    API endpoints for corpus data files. Requires an authenticated user.

    The queryset can be narrowed with query parameters:
    - `corpus`: only include data files of this corpus
    - `samples`: only include data files that are marked as a sample. Any
      value switches the filter on, except the explicit negatives `false`,
      `0`, `no` and `off` (case-insensitive) and the empty string.

    Data files are ordered by creation time.
    '''
    permission_classes = [IsAuthenticated]
    serializer_class = CorpusDataFileSerializer

    # values of a flag parameter that mean "off"
    _FALSE_VALUES = frozenset({'', '0', 'false', 'no', 'off'})

    def get_queryset(self):
        queryset = CorpusDataFile.objects.all()

        corpus = self.request.query_params.get('corpus')
        if corpus:
            queryset = queryset.filter(corpus=corpus)

        # Query parameters are strings, so `?samples=false` is truthy; compare
        # against the known negative values instead of testing truthiness.
        samples = self.request.query_params.get('samples', '')
        if samples.strip().lower() not in self._FALSE_VALUES:
            queryset = queryset.filter(is_sample=True)

        return queryset.order_by('created')

    @action(detail=True, methods=['get'])
    def info(self, request, pk):
        '''
        Return the detected field type of each column in the data file.

        The file is read as CSV, using the source data delimiter from the
        configuration of the corpus, or a comma if none is set.
        '''
        obj = self.get_object()
        delimiter = obj.corpus.configuration_obj.source_data_delimiter

        info = get_csv_info(obj.file.path, sep=delimiter or ',')

        return Response(info, HTTP_200_OK)
2 changes: 1 addition & 1 deletion backend/ianalyzer/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
},
}

MEDIA_ROOT = 'data'
MEDIA_ROOT = os.path.join(BASE_DIR, 'data')

# This needs to be the last line of the settings.py, so that all settings can be overridden.
try:
Expand Down
Loading
Loading