From e66bd13be4ebd27e92b4be846a18c862561dbffc Mon Sep 17 00:00:00 2001
From: missinglink <insomnia@rcpt.at>
Date: Tue, 1 Sep 2020 11:08:47 +0200
Subject: [PATCH 1/5] wip: parser interpretation improvements

---
 sanitizer/_text_pelias_parser.js           | 80 ++++++++++++++++------
 test/unit/sanitizer/_text_pelias_parser.js | 43 +++++++++---
 2 files changed, 93 insertions(+), 30 deletions(-)

diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index 89f547d47..4aed45c03 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -7,6 +7,11 @@ const parser = new AddressParser();
 const _ = require('lodash');
 const MAX_TEXT_LENGTH = 140;
 
+// this constant defines a lower boundary for the solution score returned
+// by the Pelias parser. Any solutions which scored lower than this value
+// will simply have their entire body returned as the $subject
+const MIN_ACCEPTABLE_SCORE = 0.3;
+
 /**
   this module provides fulltext parsing using the pelias/parser module.
   see: https://github.com/pelias/parser
@@ -52,8 +57,36 @@ function _sanitize (raw, clean) {
   return messages;
 }
 
+/**
+  The Pelias parser is responsible for interpreting an input string and then returning
+  one or more solutions for how the input tokens can been logically classified.
+
+  However, the responsibility of *how* to interpret those solutions and how best
+  to map them to variables we can use to query elasticsearch is the responsibility
+  of the function below.
+
+  The general idea is to split tokens along a 'cursor boundary', so on the left side of
+  the cursor we have the 'subject' of the query (ie. the place in question) and on the
+  right side of the query we have the administrative tokens.
+
+  note: The function below is *only* concerned with selecting $parsed_text.subject and
+  $parsed_text.admin values
+
+  see: `query/text_parser_pelias.js` for the logic which adds the other fields such
+  as $parsed_text.housenumber and $parsed_text.street etc. *before* this gets run.
+
+  Postcodes are a special case, they belong to neither the admin hierarchy nor to the
+  subject itself. They are more similar in that sense to a house number, because they
+  are a property of the address. They are tricky because they usually appear somewhere
+  in the middle of the input text. They shouldn't be included in either the subject or
+  the admin parts, unless the postcode is itself the subject of the query.
+
+  Here are some examples of how we might split a query into subject and admin:
+  "100 Foo Street Brookyln NYC" -> ["100 Foo Street", "Brookyln NYC"]
+  "Foo Bar 10017 Berlin Germany" -> ["Foo Bar", "Berlin Germany"]
+ */
 function parse (clean) {
-  
+
   // parse text
   let start = new Date();
   const t = new Tokenizer(clean.text);
@@ -73,7 +106,7 @@ function parse (clean) {
   let solution = new Solution();
   if (t.solution.length) { solution = t.solution[0]; }
 
-  // 1. map the output of the parser in to parsed_text
+  // 1. map the output of the parser into $parsed_text
   let parsed_text = { subject: undefined };
 
   solution.pair.forEach(p => {
@@ -100,6 +133,9 @@ function parse (clean) {
     .map((c, i) => (mask[i] !== 'P') ? c : ' ')
     .join('');
 
+  // same as $body above but with consecutive whitespace squashed and trimmed.
+  const normalizedBody = t.section.map(sp => sp.body).join(' ').replace(/\s+/g, ' ').trim();
+
   // scan through the input text and 'bucket' characters in to one of two buckets:
   // prefix: all unparsed characters that came before any parsed fields
   // postfix: all characters from the first admin field to the end of the string
@@ -107,26 +143,21 @@ function parse (clean) {
   // set cursor to the first classified character from selected classes
   let cursor = mask.search(/[NSAP]/);
 
-  // >> solution includes venue classification
-  // set cursor after the venue name
-  if (mask.includes('V')) { cursor = mask.lastIndexOf('V') +1; }
-
-  if (cursor === -1) { cursor = body.length; }
-  let prefix = _.trim(body.substr(0, cursor), ' ,');
-
-  // solution includes address classification
+  // if solution includes address classification
   // set cursor after the last classified address character
-  if (mask.search(/[NS]/) > -1) {
+  if ( mask.includes('N') && mask.includes('S') ) {
     cursor = Math.max(mask.lastIndexOf('N'), mask.lastIndexOf('S')) + 1;
   }
-  // solution includes admin classification
+
+  // else if solution includes admin classification
   // set cursor to the first classified admin character
   else if( mask.includes('A') ){ cursor = mask.indexOf('A'); }
-  // >> solution includes venue classification
-  // set cursor after the venue name
-  else if (mask.includes('V')) { cursor = mask.lastIndexOf('V') + 1; }
+
   // else set cursor to end-of-text
   else { cursor = body.length; }
+
+  if (cursor === -1) { cursor = body.length; }
+  let prefix = _.trim(body.substr(0, cursor), ' ,');
   let postfix = _.trim(body.substr(cursor), ' ,');
 
   // clean up spacing around commas
@@ -163,8 +194,13 @@ function parse (clean) {
   // 4. set 'subject', this is the text which will target the 'name.*'
   // fields in elasticsearch queries
 
+  // in the case where the solution score is very low we simply use the entire
+  // input as the $subject.
+  if ( solution.score < MIN_ACCEPTABLE_SCORE ) {
+    parsed_text = { subject: normalizedBody };
+  }
   // an address query
-  if (!_.isEmpty(parsed_text.housenumber) && !_.isEmpty(parsed_text.street)) {
+  else if (!_.isEmpty(parsed_text.housenumber) && !_.isEmpty(parsed_text.street)) {
     parsed_text.subject = `${parsed_text.housenumber} ${parsed_text.street}`;
   }
   // an intersection query
@@ -172,12 +208,16 @@ function parse (clean) {
     parsed_text.subject = `${parsed_text.street} & ${parsed_text.cross_street}`;
   }
   // a street query
-  else if (!_.isEmpty(parsed_text.street)) {
+  else if (!_.isEmpty(parsed_text.street) && _.isEmpty(parsed_text.venue)) {
     parsed_text.subject = parsed_text.street;
   }
   // query with a $prefix such as a venue query
   else if (!_.isEmpty(prefix)){
-    parsed_text.subject = prefix;
+    if (prefix.length > 2) {
+      parsed_text.subject = prefix;
+    } else {
+      parsed_text = { subject: normalizedBody };
+    }
   }
   // a postcode query
   else if (!_.isEmpty(parsed_text.postcode)) {
@@ -225,10 +265,10 @@ function parse (clean) {
       }
     }
   }
-  
+
   // unknown query type
   else {
-    parsed_text.subject = t.span.body;
+    parsed_text = { subject: normalizedBody };
   }
 
   return parsed_text;
diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js
index a169d5d93..8e17ac11c 100644
--- a/test/unit/sanitizer/_text_pelias_parser.js
+++ b/test/unit/sanitizer/_text_pelias_parser.js
@@ -20,7 +20,7 @@ module.exports.tests.text_parser = function (test, common) {
   });
 
   let cases = [];
-  
+
   // USA queries
   cases.push(['soho, new york, NY', {
     subject: 'soho',
@@ -154,6 +154,29 @@ module.exports.tests.text_parser = function (test, common) {
     admin: 'Kentucky'
   }]);
 
+  // badly parsed venue names should at least have correct $subject
+  cases.push(['Kells Irish Restaurant & Pub', {
+    subject: 'Kells Irish Restaurant & Pub'
+  }, true]);
+  cases.push(['The Village Zombie', {
+    subject: 'The Village Zombie'
+  }, true]);
+  cases.push(['Chili\'s Bar & Grill', {
+    subject: 'Chili\'s Bar & Grill'
+  }, true]);
+  cases.push(['OYO Hotels & Homes', {
+    subject: 'OYO Hotels & Homes'
+  }, true]);
+  cases.push(['Adamas Pharmaceuticals', {
+    subject: 'Adamas Pharmaceuticals'
+  }, true]);
+  cases.push(['St Francis Extended', {
+    subject: 'St Francis Extended'
+  }, true]);
+  cases.push(['St Francis', {
+    subject: 'St Francis'
+  }, true]);
+
   // street (USA style)
   cases.push(['M', { subject: 'M' }, true]);
   cases.push(['Ma', { subject: 'Ma' }, true]);
@@ -282,11 +305,11 @@ module.exports.tests.text_parser = function (test, common) {
   cases.push(['Kasch', { subject: 'Kasch' }, true]);
   cases.push(['Kaschk', { subject: 'Kaschk' }, true]);
   cases.push(['Kaschk ', { subject: 'Kaschk' }, true]);
-  // cases.push(['Kaschk B', { subject: 'Kaschk' }, true]); // jitter issue
-  cases.push(['Kaschk Be', { subject: 'Kaschk' }, true]);
-  // cases.push(['Kaschk Ber', { subject: 'Kaschk' }, true]); // jitter issue
-  // cases.push(['Kaschk Berl', { subject: 'Kaschk' }, true]); // jitter issue
-  // cases.push(['Kaschk Berli', { subject: 'Kaschk' }, true]); // jitter issue
+  cases.push(['Kaschk B', { subject: 'Kaschk B' }, true]);
+  cases.push(['Kaschk Be', { subject: 'Kaschk Be' }, true]);
+  cases.push(['Kaschk Ber', { subject: 'Kaschk Ber' }, true]);
+  cases.push(['Kaschk Berl', { subject: 'Kaschk Berl' }, true]);
+  cases.push(['Kaschk Berli', { subject: 'Kaschk Berli' }, true]);
   cases.push(['Kaschk Berlin', { subject: 'Kaschk' }, true]);
 
   cases.push(['A', { subject: 'A' }, true]);
@@ -308,7 +331,7 @@ module.exports.tests.text_parser = function (test, common) {
   // cases.push(['Air & Space Museu', { subject: 'Air & Space Museu' }, true]); // jitter issue
   cases.push(['Air & Space Museum', { subject: 'Air & Space Museum' }, true]);
   cases.push(['Air & Space Museum ', { subject: 'Air & Space Museum' }, true]);
-  cases.push(['Air & Space Museum D', { subject: 'Air & Space Museum' }, true]);
+  cases.push(['Air & Space Museum D', { subject: 'Air & Space Museum D' }, true]);
   cases.push(['Air & Space Museum DC', { subject: 'Air & Space Museum' }, true]);
 
   // admin areas
@@ -322,7 +345,7 @@ module.exports.tests.text_parser = function (test, common) {
   cases.push(['New York', { subject: 'New York' }, true]);
   cases.push(['New York N', { subject: 'New York' }, true]);
   cases.push(['New York NY', { subject: 'New York' }, true]);
-  
+
   cases.push(['B', { subject: 'B' }, true]);
   cases.push(['Be', { subject: 'Be' }, true]);
   cases.push(['Ber', { subject: 'Ber' }, true]);
@@ -421,9 +444,9 @@ module.exports.tests.text_parser = function (test, common) {
     const raw = {
       text: `
 Sometimes we make the process more complicated than we need to.
-We will never make a journey of a thousand miles by fretting about 
+We will never make a journey of a thousand miles by fretting about
 how long it will take or how hard it will be.
-We make the journey by taking each day step by step and then repeating 
+We make the journey by taking each day step by step and then repeating
 it again and again until we reach our destination.` };
     const clean = {};
     const messages = sanitizer.sanitize(raw, clean);

From 1084bdf54dcc821d1870a5c4ac7be782e001cd1d Mon Sep 17 00:00:00 2001
From: missinglink <insomnia@rcpt.at>
Date: Tue, 1 Sep 2020 13:40:45 +0200
Subject: [PATCH 2/5] test(parser): add test case for Marks & Spencer

---
 test/unit/sanitizer/_text_pelias_parser.js | 40 ++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js
index 8e17ac11c..e6887e5ca 100644
--- a/test/unit/sanitizer/_text_pelias_parser.js
+++ b/test/unit/sanitizer/_text_pelias_parser.js
@@ -365,6 +365,46 @@ module.exports.tests.text_parser = function (test, common) {
   cases.push(['Berlin Deutschlan', { subject: 'Berlin' }, true]);
   cases.push(['Berlin Deutschland', { subject: 'Berlin' }, true]);
 
+  // venue name with ampersand
+  // note: this query is ambigious as it could refer to either the
+  // UK high street brand "Marks & Spencer" or an intersection of
+  // Marks St and Spencer Ave.
+  // note: what we're looking for here is that we are using the whole
+  // input as the $subject regardless and without jitter.
+  cases.push(['M', { subject: 'M' }, true]);
+  cases.push(['Ma', { subject: 'Ma' }, true]);
+  cases.push(['Mar', { subject: 'Mar' }, true]);
+  cases.push(['Mark', { subject: 'Mark' }, true]);
+
+  // note: for the following 5 keystrokes the $subject is simplified to 'Marks'
+  cases.push(['Marks', { subject: 'Marks' }, true]);
+  cases.push(['Marks ', { subject: 'Marks' }, true]);
+  cases.push(['Marks &', { subject: 'Marks' }, true]);
+  cases.push(['Marks & ', { subject: 'Marks' }, true]);
+  cases.push(['Marks & S', { subject: 'Marks' }, true]);
+
+  cases.push(['Marks & Sp', { subject: 'Marks & Sp' }, true]);
+  cases.push(['Marks & Spe', { subject: 'Marks & Spe' }, true]);
+  cases.push(['Marks & Spen', { subject: 'Marks & Spen' }, true]);
+  cases.push(['Marks & Spenc', { subject: 'Marks & Spenc' }, true]);
+  cases.push(['Marks & Spence', { subject: 'Marks & Spence' }, true]);
+  cases.push(['Marks & Spencer', { subject: 'Marks & Spencer' }, true]);
+
+  // venue is also known colloquially as "M AND S"
+  cases.push(['M', { subject: 'M' }, true]);
+  cases.push(['M ', { subject: 'M' }, true]);
+  cases.push(['M &', { subject: 'M &' }, true]);
+  cases.push(['M & ', { subject: 'M &' }, true]);
+  cases.push(['M & S', { subject: 'M & S' }, true]);
+
+  cases.push(['M', { subject: 'M' }, true]);
+  cases.push(['M ', { subject: 'M' }, true]);
+  cases.push(['M a', { subject: 'M a' }, true]);
+  cases.push(['M an', { subject: 'M an' }, true]);
+  cases.push(['M and', { subject: 'M and' }, true]);
+  cases.push(['M and ', { subject: 'M and' }, true]);
+  cases.push(['M and S', { subject: 'M & S' }, true]);
+
   // postcodes
   cases.push(['2000', { subject: '2000' }, true]);
   cases.push(['Sydney 2000', { subject: '2000' }, true]);

From 24b87d3e757b09546517361362911ed3ae0c4cd6 Mon Sep 17 00:00:00 2001
From: missinglink <insomnia@rcpt.at>
Date: Tue, 1 Sep 2020 13:43:07 +0200
Subject: [PATCH 3/5] package: pin dev dependency

---
 package.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/package.json b/package.json
index 73f60981b..f30c12ad3 100644
--- a/package.json
+++ b/package.json
@@ -54,7 +54,7 @@
     "pelias-logger": "^1.2.0",
     "pelias-microservice-wrapper": "^1.7.0",
     "pelias-model": "^7.0.0",
-    "pelias-parser": "1.53.0",
+    "pelias-parser": "github:pelias/parser#venue_improvements",
     "pelias-query": "^11.0.0",
     "pelias-sorting": "^1.2.0",
     "predicates": "^2.0.0",

From 756dd85fbd0860ef1372ce4ddbb27d53dc2c8a19 Mon Sep 17 00:00:00 2001
From: missinglink <insomnia@rcpt.at>
Date: Tue, 1 Sep 2020 13:59:00 +0200
Subject: [PATCH 4/5] refactor(parser): document MIN_PREFIX_CHAR_LENGTH

---
 sanitizer/_text_pelias_parser.js | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index 4aed45c03..f06748784 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -12,6 +12,15 @@ const MAX_TEXT_LENGTH = 140;
 // will simply have their entire body returned as the $subject
 const MIN_ACCEPTABLE_SCORE = 0.3;
 
+// this constant defines the minimum amount of characters that can be
+// interpreted as a $prefix when assigning them to $subject.
+// this is useful for cases such as 'St Francis' where the parser returns
+// { locality: 'Francis' }, it's not able to classify the 'St' token.
+// in this case the leading 'St' would normally be considered the $subject
+// and the parsed_text would be { subject: 'St', admin: 'Francis' }.
+// the $MIN_PREFIX_CHAR_LENGTH var is a simple way of working around this problem.
+const MIN_PREFIX_CHAR_LENGTH = 3;
+
 /**
   this module provides fulltext parsing using the pelias/parser module.
   see: https://github.com/pelias/parser
@@ -213,7 +222,7 @@ function parse (clean) {
   }
   // query with a $prefix such as a venue query
   else if (!_.isEmpty(prefix)){
-    if (prefix.length > 2) {
+    if (prefix.length >= MIN_PREFIX_CHAR_LENGTH) {
       parsed_text.subject = prefix;
     } else {
       parsed_text = { subject: normalizedBody };

From 639b3a9a76b46d1bde1721114f2b9117a497c0b7 Mon Sep 17 00:00:00 2001
From: missinglink <insomnia@rcpt.at>
Date: Thu, 3 Sep 2020 17:06:37 +0200
Subject: [PATCH 5/5] refactor(parser): improved removal of unit numbers from
 $subject and $admin

---
 sanitizer/_text_pelias_parser.js           |  5 +++--
 test/unit/sanitizer/_text_pelias_parser.js | 10 ++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js
index f06748784..0b64852e6 100644
--- a/sanitizer/_text_pelias_parser.js
+++ b/sanitizer/_text_pelias_parser.js
@@ -137,9 +137,10 @@ function parse (clean) {
   // '    VVVV NN SSSSSSS AAAAAA PPPPP      '
   let mask = solution.mask(t);
 
-  // the entire input text as seen by the parser with any postcode classification(s) removed
+  // the entire input text as seen by the parser with any postcode and unit
+  // classification(s) removed
   let body = t.span.body.split('')
-    .map((c, i) => (mask[i] !== 'P') ? c : ' ')
+    .map((c, i) => !/[PU]/.test(mask[i]) ? c : ' ')
     .join('');
 
   // same as $body above but with consecutive whitespace squashed and trimmed.
diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js
index e6887e5ca..68b627651 100644
--- a/test/unit/sanitizer/_text_pelias_parser.js
+++ b/test/unit/sanitizer/_text_pelias_parser.js
@@ -419,6 +419,16 @@ module.exports.tests.text_parser = function (test, common) {
   cases.push(['e8 1dn', { subject: 'e8 1dn' }, true]);
   // cases.push(['london e8 1dn', { subject: 'e8 1dn' }, true]); // issue
 
+  // unit number between address and locality
+  cases.push(['7750 Kennedy Rd #2A Markham', {
+    subject: '7750 Kennedy Rd',
+    housenumber: '7750',
+    street: 'Kennedy Rd',
+    unit: '#2A',
+    locality: 'Markham',
+    admin: 'Markham',
+  }]);
+
   cases.forEach(testcase => {
     let input = testcase[0];
     let expected = testcase[1];