Automatically compute the number of lines per work.
The WORKS table formerly hardcoded the per-work line totals from Table 1
of "SEDES: Metrical Position in Greek Hexameter". But the corpus has
changed since then in ways that affect line numbering, for example
sasansom/sedes#77
sasansom/sedes#79
sasansom/sedes@04dd4a1

Furthermore, Table 1 from the "SEDES" article is produced with an
xmlstarlet command that runs directly on the source TEI, counting l and
lb elements, rather than on the derived CSV files. Our notes for the
table remark that this is because duplicate line numbers make counts
taken from the CSVs come out too low:

	For future reference:

	$ (echo "work,lines"; for a in corpus/*.xml; do echo "$a,$(xmlstarlet sel -t -m '//l' -v '"l"' -n -t -m '//lb' -v '"lb"' -n "$a" | wc -l)"; done) > corpus.csv
	> x <- read.csv("corpus.csv")
	> sum(x$lines)
	[1] 73098
	> summary(x$lines)
	   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
	    479    1017    2434    6092    9628   21356

	---

	Table 1 numbers checked 2022-09-17, sedes commit cf795ef740.

	---

	> x <- bind_rows(map_dfr(Sys.glob("corpus/*.csv"), read_csv, col_types = cols(line_n = col_character(), book_n = col_character())))
	> x %>% group_by(work) %>% summarize(n = n())

	NB the line counts you get from counting distinct line numbers in the CSV are slightly different (smaller) from what you get from xmlstarlet, because of duplicated line numbers.
	> x %>% select(work, book_n, line_n) %>% unique %>% nrow
	[1] 72954
	> x %>% select(work, book_n, line_n) %>% unique %>% group_by(work) %>% summarize(n = n())

In this repository I've started adding a workaround for the duplicate
line numbers, starting a new line whenever word_n fails to increase
within the same work, book_n, and line_n in input order. But even with
that, the automatically determined counts for Callim.Hymn and Q.S. are 1
smaller than they used to be, and unlike Dion. and Theoc., we have not
made changes to those texts that should affect the line totals. I plan
to look at those more closely, but for now, go ahead with the
automatically computed line counts, because they are what all our
percentages etc. are based on.
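
To illustrate the workaround, here is a minimal sketch on made-up toy
data (not part of the repository), assuming dplyr/tidyr and mirroring
the work, book_n, line_n, word_n columns of sedes/joined.all.csv:

	library("dplyr")
	library("tidyr")
	library("tibble")

	# Toy input: two physical lines share line_n == "5".
	words <- tribble(
	  ~work, ~book_n, ~line_n, ~word_n,
	  "Il.", "1", "5", 1L,
	  "Il.", "1", "5", 2L,
	  "Il.", "1", "5", 3L,
	  "Il.", "1", "5", 1L,  # word_n resets: start of a new physical line
	  "Il.", "1", "5", 2L,
	  "Il.", "1", "6", 1L
	)

	# Same idea as in HB_Database_Predraft.r: a new line index begins
	# whenever word_n does not increase within the same work/book_n/line_n.
	lines <- words %>%
	  mutate(idx = cumsum(replace_na(
	    !(work == lag(work) & book_n == lag(book_n) &
	      line_n == lag(line_n) & word_n > lag(word_n)),
	    TRUE))) %>%
	  group_by(idx) %>%
	  summarize(across(c(work, book_n, line_n), first), .groups = "drop")

	nrow(lines)
	# [1] 3 -- counting distinct (work, book_n, line_n) would give only 2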

If I repeat the xmlstarlet calculation with the current SEDES files
(605a27b3af22089379aad22ba96edf113970a7b0), the only change I get is 3
fewer lines in Dion. Using the automatically determined line counts
takes the total down another 99 lines across 4 works.

   work        old_num_lines redo_old_num_lines diff1 new_num_lines diff2
   <chr>               <dbl>              <dbl> <dbl>         <int> <dbl>
 1 Phaen.               1155               1155     0          1155     0
 2 Argon.               5834               5834     0          5834     0
 3 Callim.Hymn           941                941     0           940    -1
 4 Hom.Hymn             2342               2342     0          2342     0
 5 Il.                 15683              15683     0         15683     0
 6 Dion.               21356              21353    -3         21259   -97
 7 Od.                 12107              12107     0         12107     0
 8 Q.S.                 8801               8801     0          8800    -1
 9 Sh.                   479                479     0           479     0
10 Theoc.               2527               2527     0          2524    -3
11 Theog.               1042               1042     0          1042     0
12 W.D.                  831                831     0           831     0
13 total               73098              73095    -3         72996  -102
David Fifield committed Jun 1, 2023
1 parent 570720c commit 3c12888
Showing 5 changed files with 44 additions and 19 deletions.
53 changes: 39 additions & 14 deletions HB_Database_Predraft.r
@@ -4,19 +4,19 @@ library("cowplot")
WIDTH <- 6 # in

WORKS <- tribble(
~work, ~num_lines, ~date, ~work_name,
"Argon.", 5834, -350, "Argonautica",
"Callim.Hymn", 941, -250, "Callimachus’ Hymns",
"Dion.", 21356, 450, "Nonnus’ Dionysiaca",
"Hom.Hymn", 2342, -600, "Homeric Hymns",
"Il.", 15683, -750, "Iliad",
"Od.", 12107, -750, "Odyssey",
"Phaen.", 1155, -250, "Aratus’ Phaenomena",
"Q.S.", 8801, 350, "Quintus of Smyrna’s Fall of Troy",
"Sh.", 479, -550, "Shield",
"Theoc.", 2527, -250, "Theocritus’ Idylls",
"Theog.", 1042, -750, "Theogony",
"W.D.", 831, -750, "Works and Days",
~work, ~date, ~work_name,
"Argon.", -350, "Argonautica",
"Callim.Hymn", -250, "Callimachus’ Hymns",
"Dion.", 450, "Nonnus’ Dionysiaca",
"Hom.Hymn", -600, "Homeric Hymns",
"Il.", -750, "Iliad",
"Od.", -750, "Odyssey",
"Phaen.", -250, "Aratus’ Phaenomena",
"Q.S.", 350, "Quintus of Smyrna’s Fall of Troy",
"Sh.", -550, "Shield",
"Theoc.", -250, "Theocritus’ Idylls",
"Theog.", -750, "Theogony",
"W.D.", -750, "Works and Days",
)

# TODO: fix off-by-one in BCE dates.
@@ -36,6 +36,30 @@ theme_set(
)
update_geom_defaults("text", aes(family = FONT_FAMILY))

# Read sedes/joined.all.csv just to get the total number of lines per work.
num_lines <- read_csv("sedes/joined.all.csv",
col_types = cols_only(
work = col_factor(),
book_n = col_character(),
line_n = col_character(),
word_n = col_integer()
)
) %>%
# Add an index to the original lines, in order to restore original
# ordering after summarization. This also disambiguates cases of
# duplicate line numbers: we consider it a line break whenever word_n
# does not increase--otherwise all the words in the lines with repeated
# line numbers would be considered part of the same line.
mutate(idx = cumsum(replace_na(
!(work == lag(work) & book_n == lag(book_n) & line_n == lag(line_n) & word_n > lag(word_n)),
TRUE))) %>%
group_by(idx) %>%
summarize(
across(c(work, book_n, line_n), first),
.groups = "drop"
) %>%
count(work, name = "num_lines")

# Read input and tidy.
data <- read_csv(
"HB_Database_Predraft.csv",
@@ -77,8 +101,9 @@ break_rates <- data %>%
.groups = "drop"
) %>%

# Join with table of per-work metadata.
# Join with tables of per-work metadata.
left_join(WORKS, by = c("work")) %>%
left_join(num_lines, by = c("work")) %>%

# Sort in decreasing order by break rate, then by ascending by date,
# then ascending by work name.
2 changes: 1 addition & 1 deletion Makefile
@@ -23,7 +23,7 @@ break_rates_over_time.png \
breaks_vs_caesurae_rates.png \
clusters.png \
speaker_frequency.csv \
: .EXTRA_PREREQS = HB_Database_Predraft.r
: .EXTRA_PREREQS = HB_Database_Predraft.r sedes/joined.all.csv
break_rates.csv \
break_rates_over_time.png \
clusters.png \
8 changes: 4 additions & 4 deletions break_rates.csv
@@ -1,13 +1,13 @@
Work,Lines,Caesurae,Breaks,Breaks/Line,_
Aratus’ Phaenomena,"1,155",74,5, 0.43%,(1 per 231)
Homeric Hymns,"2,342",77,8, 0.34%,(1 per 293)
Theocritus’ Idylls,"2,527",130,7, 0.28%,(1 per 361)
Theocritus’ Idylls,"2,524",130,7, 0.28%,(1 per 361)
Iliad,"15,683",725,41, 0.26%,(1 per 383)
Odyssey,"12,107",572,29, 0.24%,(1 per 417)
Shield,479,21,1, 0.21%,(1 per 479)
Works and Days,831,50,1, 0.12%,(1 per 831)
Quintus of Smyrna’s Fall of Troy,"8,801",402,10, 0.11%,(1 per 880)
Quintus of Smyrna’s Fall of Troy,"8,800",402,10, 0.11%,(1 per 880)
Theogony,"1,042",40,1, 0.10%,"(1 per 1,042)"
Argonautica,"5,834",199,0, 0%,
Callimachus’ Hymns,941,29,0, 0%,
Nonnus’ Dionysiaca,"21,356",157,0, 0%,
Callimachus’ Hymns,940,29,0, 0%,
Nonnus’ Dionysiaca,"21,259",157,0, 0%,
Binary file modified break_rates_over_time.png
Binary file modified breaks_vs_caesurae_rates.png
