Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/corpus form #1659

Open
wants to merge 53 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
e9f91bf
Allow corpus create with only title
JeltevanBoheemen Jul 31, 2024
13d10f1
Scaffold definition form in frontend
JeltevanBoheemen Aug 6, 2024
b5ef920
CorpusDataFile model and views
JeltevanBoheemen Aug 6, 2024
f57e3f6
Backend view for csv info
JeltevanBoheemen Aug 7, 2024
12ec2f5
Adapt backend field types to frontend model
JeltevanBoheemen Aug 13, 2024
0555989
Merge branch 'develop' into feature/corpus-form
JeltevanBoheemen Aug 13, 2024
e3cd17d
Fix corpus form tests
JeltevanBoheemen Aug 13, 2024
538203f
Show datafiles in corpus admin
JeltevanBoheemen Aug 13, 2024
14c627d
Upload sample data frontend
JeltevanBoheemen Aug 14, 2024
fe587e2
Set csv delimiter and read csv with it
JeltevanBoheemen Aug 14, 2024
f5f2f3c
Rename ApiCorpusField in CorpusDefinition
JeltevanBoheemen Aug 14, 2024
f513131
Polish corpus form
JeltevanBoheemen Aug 15, 2024
578ed39
Make fields from sample data
JeltevanBoheemen Aug 15, 2024
4a38f87
Single field form
JeltevanBoheemen Aug 21, 2024
6edf6c1
First version of fields form
JeltevanBoheemen Aug 29, 2024
90d7ab9
Dropdown for language selection
JeltevanBoheemen Sep 13, 2024
b62591d
Dropdown for corpus categories
JeltevanBoheemen Sep 13, 2024
c185280
Populate sample data info from database
JeltevanBoheemen Sep 13, 2024
1192591
Navigate to form after creation, else overview
JeltevanBoheemen Sep 13, 2024
ac27a66
Activate steps when fields already present
JeltevanBoheemen Sep 13, 2024
d2479b5
Provide some help text for fields
JeltevanBoheemen Sep 13, 2024
2741786
Merge branch 'develop' into feature/corpus-form
JeltevanBoheemen Sep 13, 2024
3a09ad9
Remove debug routes
JeltevanBoheemen Sep 13, 2024
a41a4b0
Use date input fieldsfor metadata
JeltevanBoheemen Sep 19, 2024
f6bab2e
Rename 'submit corpus' to 'save changes'
JeltevanBoheemen Sep 19, 2024
12fcade
Use empty array for languages instead of null
JeltevanBoheemen Sep 19, 2024
04907a4
Include field name and extract in form
JeltevanBoheemen Oct 1, 2024
19296e9
Remove the 'hidden' option in field form
JeltevanBoheemen Oct 1, 2024
3ae11d2
Show help texts for field types
JeltevanBoheemen Oct 1, 2024
bdf4ce6
Add boolean field type
JeltevanBoheemen Oct 1, 2024
a10906e
Conditionally show language picker
JeltevanBoheemen Oct 1, 2024
b8e190b
Rename edit to InOut, form to edit
JeltevanBoheemen Oct 3, 2024
ecb814d
Fix frontend test imports
JeltevanBoheemen Oct 3, 2024
68d4867
fix tests
lukavdplas Nov 6, 2024
3ce0b7e
Resolve minor form issues
JeltevanBoheemen Dec 10, 2024
14a5b4a
Merge branch 'develop' into feature/corpus-form
JeltevanBoheemen Dec 10, 2024
2787872
Rename migration
JeltevanBoheemen Dec 10, 2024
6c6f28a
Make is_date_col work with missing values
JeltevanBoheemen Dec 10, 2024
bb3cac5
Serialize datafile name instead of path
JeltevanBoheemen Dec 10, 2024
168ad63
Rename toggleStep to toggleStepDisabled
JeltevanBoheemen Dec 10, 2024
3edef17
add delimiter type
JeltevanBoheemen Dec 10, 2024
172a98f
use cloneDeep for nested objects
JeltevanBoheemen Dec 10, 2024
1023af9
Reword create from scratch
JeltevanBoheemen Dec 10, 2024
5880510
Clean up CorpusFormComponent template
JeltevanBoheemen Dec 10, 2024
0c3fe08
hide filter option for text content
JeltevanBoheemen Dec 10, 2024
8539528
Show/hide field form options for text fields
JeltevanBoheemen Dec 10, 2024
42b4527
Use paragraph elements for subtitles
JeltevanBoheemen Dec 10, 2024
d3bd57a
Use ISO 639-3 for language picker
JeltevanBoheemen Dec 10, 2024
05342e7
Language picker for meta form
JeltevanBoheemen Dec 10, 2024
34ad8b1
Remove dangercolor from reset sample button
JeltevanBoheemen Dec 10, 2024
c0a22c4
Use a table for sample component
JeltevanBoheemen Dec 10, 2024
57eadd1
Show column name in field form
JeltevanBoheemen Dec 11, 2024
ac41163
Extra help texts or corpus form
JeltevanBoheemen Dec 11, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion backend/addcorpus/admin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib import admin, messages
from .models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
from .models import Corpus, CorpusConfiguration, CorpusDataFile, Field, CorpusDocumentationPage

def show_warning_message(request):
'''
Expand All @@ -14,6 +14,12 @@ def show_warning_message(request):
)


class InlineDatafileAdmin(admin.StackedInline):
    '''
    Stacked inline that lists the `CorpusDataFile` objects of a corpus.

    Exposes the uploaded file and its `is_sample` flag; no empty extra forms
    are rendered.
    '''
    model = CorpusDataFile
    fields = ['file', 'is_sample']
    # Must be a plain boolean: a trailing comma here would turn the value
    # into the tuple `(True,)`.
    show_change_link = True
    extra = 0

class CorpusAdmin(admin.ModelAdmin):
readonly_fields = [
'configuration', 'ready_to_index', 'ready_to_publish', 'date_created',
Expand All @@ -24,13 +30,15 @@ class CorpusAdmin(admin.ModelAdmin):
]
list_display = ['name', 'active']
list_filter = ['groups', 'active']
inlines = [InlineDatafileAdmin]

class InlineFieldAdmin(admin.StackedInline):
    '''
    Stacked inline for `Field` objects.

    Only the display name and description are editable inline; a change link
    leads to the full form, and no empty extra forms are rendered.
    '''
    model = Field
    extra = 0
    show_change_link = True
    fields = ['display_name', 'description']


class CorpusConfigurationAdmin(admin.ModelAdmin):
readonly_fields = ['corpus']

Expand Down
6 changes: 6 additions & 0 deletions backend/addcorpus/json_corpora/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
from datetime import date


# Delimiter for CSV source data (presumably used when a corpus definition
# does not specify one - verify against the import/export code).
DEFAULT_CSV_DELIMITER = ','
# strptime pattern used to parse dates in JSON corpus definitions (YYYY-MM-DD)
DATE_FORMAT = '%Y-%m-%d'

# Fallback date range for corpus definitions that omit `meta.date_range`
DEFAULT_MIN_DATE = date(1800, 1, 1)
# NOTE(review): evaluated once, when this module is first imported; a
# long-running process keeps that day's date as the default maximum.
DEFAULT_MAX_DATE = date.today()
15 changes: 8 additions & 7 deletions backend/addcorpus/json_corpora/import_json.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
from typing import List, Dict, Iterable, Optional
from datetime import datetime
from datetime import date, datetime


from addcorpus.models import Corpus, CorpusConfiguration, Field
from addcorpus.models import Field
from addcorpus.json_corpora.utils import get_path
from addcorpus import es_mappings
from addcorpus.constants import VisualizationType
from addcorpus.validation.publishing import _any_date_fields
from django.conf import settings
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT
from addcorpus.json_corpora.constants import DEFAULT_CSV_DELIMITER, DATE_FORMAT, DEFAULT_MAX_DATE, DEFAULT_MIN_DATE

def import_json_corpus(data: Dict) -> Dict:
name = get_path(data, 'name')
Expand All @@ -33,9 +32,9 @@ def _parse_configuration(data: Dict) -> Dict:
'es_index': create_index_name(get_path(data, 'name')),
'languages': get_path(data, 'meta', 'languages'),
'min_date': _parse_date(
get_path(data, 'meta', 'date_range', 'min')),
get_path(data, 'meta', 'date_range', 'min'), DEFAULT_MIN_DATE),
'max_date': _parse_date(
get_path(data, 'meta', 'date_range', 'max')),
get_path(data, 'meta', 'date_range', 'max'), DEFAULT_MAX_DATE),
'default_sort': get_path(
data, 'options', 'default_sort') or {},
'language_field': get_path(
Expand All @@ -48,7 +47,9 @@ def _parse_configuration(data: Dict) -> Dict:
}


def _parse_date(date: Optional[str], fallback: Optional[date] = None) -> Optional[date]:
    '''
    Parse a date string from a JSON corpus definition.

    Parameters:
        date: date formatted according to `DATE_FORMAT`; may be `None` or
            empty when the definition does not provide a value.
        fallback: value returned when `date` is missing. Defaults to `None`
            so that one-argument calls keep working.

    Returns:
        The parsed `datetime.date`, or `fallback` if no date was given.

    Raises:
        ValueError: if `date` is non-empty but does not match `DATE_FORMAT`.
    '''
    # Note: inside the body, `date` is the string argument, not the
    # `datetime.date` class used in the annotations above.
    if not date:
        return fallback
    return datetime.strptime(date, DATE_FORMAT).date()


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Generated by Django 4.2.17 on 2024-12-10 13:25

import addcorpus.models
import addcorpus.validation.creation
import django.contrib.postgres.fields
from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Sets `category`, `description` and `languages` on CorpusConfiguration
    # to the definitions below (blank values allowed; null allowed for the two
    # CharFields) and creates the CorpusDataFile table.

    dependencies = [
        ('addcorpus', '0026_corpus_date_created'),
    ]

    operations = [
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='category',
            field=models.CharField(
                blank=True,
                choices=[
                    ('parliament', 'Parliamentary debates'),
                    ('periodical', 'Newspapers and other periodicals'),
                    ('finance', 'Financial reports'),
                    ('ruling', 'Court rulings'),
                    ('review', 'Online reviews'),
                    ('inscription', 'Funerary inscriptions'),
                    ('oration', 'Orations'),
                    ('book', 'Books'),
                    ('informative', 'Informative'),
                ],
                help_text='category/medium of documents in this dataset',
                max_length=64,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='description',
            field=models.CharField(
                blank=True,
                help_text='short description of the corpus',
                max_length=254,
                null=True,
            ),
        ),
        migrations.AlterField(
            model_name='corpusconfiguration',
            name='languages',
            field=django.contrib.postgres.fields.ArrayField(
                base_field=models.CharField(
                    blank=True,
                    max_length=8,
                    validators=[
                        addcorpus.validation.creation.validate_language_code,
                    ],
                ),
                blank=True,
                help_text='languages used in the content of the corpus (from most to least frequent)',
                size=None,
            ),
        ),
        migrations.CreateModel(
            name='CorpusDataFile',
            fields=[
                (
                    'id',
                    models.BigAutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'file',
                    models.FileField(
                        help_text='file containing corpus data',
                        upload_to=addcorpus.models.CorpusDataFile.upload_path,
                    ),
                ),
                (
                    'is_sample',
                    models.BooleanField(
                        default=False,
                        help_text='This file is used in creating the corpus definition, it may additonaly reflect (part of) the actual data.',
                    ),
                ),
                (
                    'created',
                    models.DateTimeField(auto_now_add=True),
                ),
                (
                    'corpus',
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to='addcorpus.corpus',
                    ),
                ),
            ],
        ),
    ]
20 changes: 20 additions & 0 deletions backend/addcorpus/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import warnings

from django.contrib import admin
Expand Down Expand Up @@ -178,10 +179,13 @@ class CorpusConfiguration(models.Model):
max_length=64,
choices=CATEGORIES,
help_text='category/medium of documents in this dataset',
blank=True,
null=True
)
description = models.CharField(
max_length=MAX_LENGTH_DESCRIPTION,
blank=True,
null=True,
help_text='short description of the corpus',
)
document_context = models.JSONField(
Expand Down Expand Up @@ -211,6 +215,7 @@ class CorpusConfiguration(models.Model):
blank=True,
),
help_text='languages used in the content of the corpus (from most to least frequent)',
blank=True,
)
min_date = models.DateField(
help_text='earliest date for the data in the corpus',
Expand Down Expand Up @@ -514,3 +519,18 @@ class Meta:
name='unique_documentation_type_for_corpus'
)
]


class CorpusDataFile(models.Model):
    '''
    A data file that belongs to a corpus.

    Files are stored under `corpus_datafiles/<corpus pk>/`. The `is_sample`
    flag marks files that are used while creating the corpus definition.
    Rows are deleted together with their corpus.
    '''

    def upload_path(self, filename):
        '''Return the storage path for `filename`, grouped per corpus.'''
        corpus_dir = str(self.corpus.pk)
        return os.path.join('corpus_datafiles', corpus_dir, filename)

    corpus = models.ForeignKey(to=Corpus, on_delete=models.CASCADE)
    file = models.FileField(
        upload_to=upload_path,
        help_text='file containing corpus data',
    )
    # NOTE(review): 'additonaly' is misspelled in the help text below. The
    # string is part of the model state, so correcting it means generating a
    # new migration as well.
    is_sample = models.BooleanField(
        default=False,
        help_text='This file is used in creating the corpus definition, it may additonaly reflect (part of) the actual data.',
    )
    created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        return str(self.file.name)
33 changes: 27 additions & 6 deletions backend/addcorpus/serializers.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from rest_framework import serializers
from typing import Dict

from addcorpus.models import Corpus, CorpusConfiguration, Field, CorpusDocumentationPage
from addcorpus.constants import CATEGORIES
from langcodes import Language, standardize_tag
from addcorpus.documentation import render_documentation_context
from addcorpus.json_corpora.export_json import export_json_corpus
from addcorpus.json_corpora.import_json import import_json_corpus
from addcorpus.models import (Corpus, CorpusConfiguration, CorpusDataFile,
CorpusDocumentationPage, Field)
from django.core.files import File
from langcodes import Language, standardize_tag
from rest_framework import serializers


class NonEmptyJSONField(serializers.JSONField):
Expand Down Expand Up @@ -197,9 +199,12 @@ def update(self, instance: Corpus, validated_data: Dict):
configuration.save()

for field_data in fields_data:
field, _ = Field.objects.get_or_create(
corpus_configuration=configuration, name=field_data['name']
)
try:
field = Field.objects.get(
corpus_configuration=configuration, name=field_data['name'])
except Field.DoesNotExist:
field = Field(corpus_configuration=configuration,
name=field_data['name'])
for attr in field_data:
setattr(field, attr, field_data[attr])
field.save()
Expand All @@ -211,3 +216,19 @@ def update(self, instance: Corpus, validated_data: Dict):
corpus.save()

return corpus


class DataFileField(serializers.FileField):
    '''
    File field that is represented by the `name` attribute of the file.

    Only the outgoing representation is customised; incoming data is handled
    by the inherited `FileField.to_internal_value`.
    '''

    def to_representation(self, value: File) -> str:
        '''Return the name of the file instead of the default representation.'''
        return value.name


class CorpusDataFileSerializer(serializers.ModelSerializer):
    '''
    Serializer for `CorpusDataFile` objects.

    The `file` field goes through `DataFileField`, so it is written out as
    the file's name.
    '''

    file = DataFileField()

    class Meta:
        model = CorpusDataFile
        fields = (
            'id',
            'corpus',
            'file',
            'created',
            'is_sample',
        )
11 changes: 11 additions & 0 deletions backend/addcorpus/tests/files/example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
character,line,date-column,FLOAT COLUMN,int_column,bool column
"HAMLET","Whither wilt thou lead me? Speak, I'll go no further.","1256-10-08","7.03","13","True"
"GHOST","Mark me.","1435-10-07","4.74","89","True"
"HAMLET","I will.","1463-07-16","5.55","29","True"
"GHOST","My hour is almost come,","1634-08-09","3.04","100","True"
"GHOST","When I to sulph'rous and tormenting flames","1982-09-01","6.73","34","True"
"GHOST","Must render up myself.","1756-11-22","-0.58","-12","False"
"HAMLET","Alas, poor ghost!","1200-09-05","9.38","6","False"
"GHOST","Pity me not, but lend thy serious hearing","1633-11-18","8.84","83","False"
"GHOST","To what I shall unfold.","1445-11-09","3.6","97","False"
"HAMLET","Speak, I am bound to hear.","1984-08-12","-1.89","-4","False"
28 changes: 28 additions & 0 deletions backend/addcorpus/tests/test_datafiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os

from rest_framework.status import HTTP_200_OK, HTTP_201_CREATED

here = os.path.dirname(os.path.abspath(__file__))


def test_csv_upload(admin_client, json_mock_corpus):
    '''
    Upload the example CSV as a sample file for the mock corpus, then check
    the column types reported by the datafile's info endpoint.
    '''
    csv_path = os.path.join(here, 'files', 'example.csv')
    expected_types = {
        'character': 'text',
        'line': 'text',
        'date-column': 'date',
        'FLOAT COLUMN': 'float',
        'int_column': 'integer',
        'bool column': 'boolean'
    }

    # Test file upload; the request is made while the file handle is open
    with open(csv_path, 'rb') as csv_file:
        payload = {
            'corpus': json_mock_corpus.pk,
            'is_sample': True,
            'file': csv_file,
        }
        upload_res = admin_client.post('/api/corpus/datafiles/', payload)
    assert upload_res.status_code == HTTP_201_CREATED

    # Test file info
    datafile_pk = upload_res.data.get('id')
    info_res = admin_client.get(f'/api/corpus/datafiles/{datafile_pk}/info/')
    assert info_res.status_code == HTTP_200_OK
    assert info_res.data == expected_types
19 changes: 19 additions & 0 deletions backend/addcorpus/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from addcorpus.utils import is_date, is_date_col
import pandas as pd


def test_is_date():
    '''`is_date` accepts YYYY-MM-DD strings and rejects everything else.'''
    assert is_date('2024-01-01')

    # missing value, non-string value, wrong field order
    for rejected in (None, 5, '01-01-2024'):
        assert not is_date(rejected)


def test_is_date_col():
    '''
    A column counts as a date column when it has at least one value and all
    of its non-missing values are dates.
    '''
    only_dates = pd.Series(['1800-01-01', '2024-01-01'])
    missing_values = pd.Series([None, ''])
    dates_with_gaps = pd.concat([only_dates, missing_values])
    only_missing = pd.Series([None, None])

    assert is_date_col(only_dates)
    assert is_date_col(dates_with_gaps)
    assert not is_date_col(only_missing)
48 changes: 48 additions & 0 deletions backend/addcorpus/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import datetime
import os
from typing import Dict, Optional, Union

import numpy as np
import pandas as pd


def get_csv_info(path: Union[str, os.PathLike], **kwargs) -> Dict:
    '''
    Read a CSV file and describe the type of each of its columns.

    Parameters:
        path: location of the CSV file.
        **kwargs: passed on to `pandas.read_csv`.

    Returns:
        A dict that maps each column name to the type name given by `map_col`.
    '''
    frame = pd.read_csv(path, **kwargs)
    column_types = {}
    for column in frame.columns:
        column_types[column] = map_col(frame[column])
    return column_types


def map_col(col: pd.Series) -> str:
    '''
    Map the dtype of a column to a field type name.

    Dtype families are checked with the `pandas.api.types` predicates rather
    than by equality with one concrete dtype, so that other integer/float
    widths, pandas' nullable dtypes and the dedicated string dtype are
    classified as well.

    Parameters:
        col: the column to classify.

    Returns:
        One of `'boolean'`, `'integer'`, `'float'`, `'date'` or `'text'`.
        Columns of any other dtype are reported as `'text'`.
    '''
    dtype = col.dtype
    types = pd.api.types

    if types.is_bool_dtype(dtype):
        return 'boolean'
    if types.is_integer_dtype(dtype):
        return 'integer'
    if types.is_float_dtype(dtype):
        return 'float'

    # Only string-like columns can hold formatted dates; the content check
    # is skipped for every other dtype.
    string_like = types.is_object_dtype(dtype) or types.is_string_dtype(dtype)
    if string_like and is_date_col(col):
        return 'date'
    return 'text'


def is_date_col(col: pd.Series) -> bool:
    '''
    Check if a column only contains dates or missing values.

    Empty strings count as missing: they are filtered out explicitly because
    they are not picked up by `isna()`.

    Parameters:
        col: the column to inspect.

    Returns:
        `True` if the column has at least one non-missing value and every
        non-missing value passes `is_date`; `False` otherwise (including for
        columns without any values).
    '''
    # Filter with a mask instead of `replace('', None)`: what `value=None`
    # means in `Series.replace` differs between pandas versions.
    candidates = col[col != ''].dropna()
    if candidates.empty:
        return False
    # `all()` yields a numpy boolean; convert to match the annotation
    return bool(candidates.map(is_date).all())


def is_date(input: object) -> bool:
    '''
    Check if a value is a date string in the format YYYY-MM-DD.

    Parameters:
        input: the value to check; values that are not strings are accepted
            and simply yield `False`.

    Returns:
        `True` if `input` can be parsed with the format `%Y-%m-%d`,
        `False` otherwise.
    '''
    try:
        datetime.datetime.strptime(input, '%Y-%m-%d')
    except (ValueError, TypeError):
        # ValueError: string in another format; TypeError: not a string
        return False
    return True
Loading
Loading