From e66bd13be4ebd27e92b4be846a18c862561dbffc Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Sep 2020 11:08:47 +0200 Subject: [PATCH 1/5] wip: parser interpretation improvements --- sanitizer/_text_pelias_parser.js | 80 ++++++++++++++++------ test/unit/sanitizer/_text_pelias_parser.js | 43 +++++++++--- 2 files changed, 93 insertions(+), 30 deletions(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 89f547d47..4aed45c03 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -7,6 +7,11 @@ const parser = new AddressParser(); const _ = require('lodash'); const MAX_TEXT_LENGTH = 140; +// this constant defines a lower boundary for the solution score returned +// by the Pelias parser. Any solutions which scored lower than this value +// will simply have their entire body returned as the $subject +const MIN_ACCEPTABLE_SCORE = 0.3; + /** this module provides fulltext parsing using the pelias/parser module. see: https://github.com/pelias/parser @@ -52,8 +57,36 @@ function _sanitize (raw, clean) { return messages; } +/** + The Pelias parser is responsible for interpreting an input string and then returning + one or more solutions for how the input tokens can been logically classified. + + However, the responsibility of *how* to interpret those solutions and how best + to map them to variables we can use to query elasticsearch is the responsibility + of the function below. + + The general idea is to split tokens along a 'cursor boundary', so on the left side of + the cursor we have the 'subject' of the query (ie. the place in question) and on the + right side of the query we have the administrative tokens. + + note: The function below is *only* concerned with selecting $parsed_text.subject and + $parsed_text.admin values + + see: `query/text_parser_pelias.js` for the logic which adds the other fields such + as $parsed_text.housenumber and $parsed_text.street etc. *before* this gets run. + + Postcodes are a special case, they belong to neither the admin hierarchy nor to the + subject itself. They are more similar in that sense to a house number, because they + are a property of the address. They are tricky because they usually appear somewhere + in the middle of the input text. They shouldn't be included in either the subject or + the admin parts, unless the postcode is itself the subject of the query. + + Here are some examples of how we might split a query into subject and admin: + "100 Foo Street Brookyln NYC" -> ["100 Foo Street", "Brookyln NYC"] + "Foo Bar 10017 Berlin Germany" -> ["Foo Bar", "Berlin Germany"] + */ function parse (clean) { - + // parse text let start = new Date(); const t = new Tokenizer(clean.text); @@ -73,7 +106,7 @@ function parse (clean) { let solution = new Solution(); if (t.solution.length) { solution = t.solution[0]; } - // 1. map the output of the parser in to parsed_text + // 1. map the output of the parser into $parsed_text let parsed_text = { subject: undefined }; solution.pair.forEach(p => { @@ -100,6 +133,9 @@ function parse (clean) { .map((c, i) => (mask[i] !== 'P') ? c : ' ') .join(''); + // same as $body above but with consecutive whitespace squashed and trimmed. + const normalizedBody = t.section.map(sp => sp.body).join(' ').replace(/\s+/g, ' ').trim(); + // scan through the input text and 'bucket' characters in to one of two buckets: // prefix: all unparsed characters that came before any parsed fields // postfix: all characters from the first admin field to the end of the string @@ -107,26 +143,21 @@ function parse (clean) { // set cursor to the first classified character from selected classes let cursor = mask.search(/[NSAP]/); - // >> solution includes venue classification - // set cursor after the venue name - if (mask.includes('V')) { cursor = mask.lastIndexOf('V') +1; } - - if (cursor === -1) { cursor = body.length; } - let prefix = _.trim(body.substr(0, cursor), ' ,'); - - // solution includes address classification + // if solution includes address classification // set cursor after the last classified address character - if (mask.search(/[NS]/) > -1) { + if ( mask.includes('N') && mask.includes('S') ) { cursor = Math.max(mask.lastIndexOf('N'), mask.lastIndexOf('S')) + 1; } - // solution includes admin classification + + // else if solution includes admin classification // set cursor to the first classified admin character else if( mask.includes('A') ){ cursor = mask.indexOf('A'); } - // >> solution includes venue classification - // set cursor after the venue name - else if (mask.includes('V')) { cursor = mask.lastIndexOf('V') + 1; } + // else set cursor to end-of-text else { cursor = body.length; } + + if (cursor === -1) { cursor = body.length; } + let prefix = _.trim(body.substr(0, cursor), ' ,'); let postfix = _.trim(body.substr(cursor), ' ,'); // clean up spacing around commas @@ -163,8 +194,13 @@ function parse (clean) { // 4. set 'subject', this is the text which will target the 'name.*' // fields in elasticsearch queries + // in the case where the solution score is very low we simply use the entire + // input as the $subject. + if ( solution.score < MIN_ACCEPTABLE_SCORE ) { + parsed_text = { subject: normalizedBody }; + } // an address query - if (!_.isEmpty(parsed_text.housenumber) && !_.isEmpty(parsed_text.street)) { + else if (!_.isEmpty(parsed_text.housenumber) && !_.isEmpty(parsed_text.street)) { parsed_text.subject = `${parsed_text.housenumber} ${parsed_text.street}`; } // an intersection query @@ -172,12 +208,16 @@ function parse (clean) { parsed_text.subject = `${parsed_text.street} & ${parsed_text.cross_street}`; } // a street query - else if (!_.isEmpty(parsed_text.street)) { + else if (!_.isEmpty(parsed_text.street) && _.isEmpty(parsed_text.venue)) { parsed_text.subject = parsed_text.street; } // query with a $prefix such as a venue query else if (!_.isEmpty(prefix)){ - parsed_text.subject = prefix; + if (prefix.length > 2) { + parsed_text.subject = prefix; + } else { + parsed_text = { subject: normalizedBody }; + } } // a postcode query else if (!_.isEmpty(parsed_text.postcode)) { @@ -225,10 +265,10 @@ function parse (clean) { } } } - + // unknown query type else { - parsed_text.subject = t.span.body; + parsed_text = { subject: normalizedBody }; } return parsed_text; diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index a169d5d93..8e17ac11c 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -20,7 +20,7 @@ module.exports.tests.text_parser = function (test, common) { }); let cases = []; - + // USA queries cases.push(['soho, new york, NY', { subject: 'soho', @@ -154,6 +154,29 @@ module.exports.tests.text_parser = function (test, common) { admin: 'Kentucky' }]); + // badly parsed venue names should at least have correct $subject + cases.push(['Kells Irish Restaurant & Pub', { + subject: 'Kells Irish Restaurant & Pub' + }, true]); + cases.push(['The Village Zombie', { + subject: 'The Village Zombie' + }, true]); + cases.push(['Chili\'s Bar & Grill', { + subject: 'Chili\'s Bar & Grill' + }, true]); + cases.push(['OYO Hotels & Homes', { + subject: 'OYO Hotels & Homes' + }, true]); + cases.push(['Adamas Pharmaceuticals', { + subject: 'Adamas Pharmaceuticals' + }, true]); + cases.push(['St Francis Extended', { + subject: 'St Francis Extended' + }, true]); + cases.push(['St Francis', { + subject: 'St Francis' + }, true]); + // street (USA style) cases.push(['M', { subject: 'M' }, true]); cases.push(['Ma', { subject: 'Ma' }, true]); @@ -282,11 +305,11 @@ module.exports.tests.text_parser = function (test, common) { cases.push(['Kasch', { subject: 'Kasch' }, true]); cases.push(['Kaschk', { subject: 'Kaschk' }, true]); cases.push(['Kaschk ', { subject: 'Kaschk' }, true]); - // cases.push(['Kaschk B', { subject: 'Kaschk' }, true]); // jitter issue - cases.push(['Kaschk Be', { subject: 'Kaschk' }, true]); - // cases.push(['Kaschk Ber', { subject: 'Kaschk' }, true]); // jitter issue - // cases.push(['Kaschk Berl', { subject: 'Kaschk' }, true]); // jitter issue - // cases.push(['Kaschk Berli', { subject: 'Kaschk' }, true]); // jitter issue + cases.push(['Kaschk B', { subject: 'Kaschk B' }, true]); + cases.push(['Kaschk Be', { subject: 'Kaschk Be' }, true]); + cases.push(['Kaschk Ber', { subject: 'Kaschk Ber' }, true]); + cases.push(['Kaschk Berl', { subject: 'Kaschk Berl' }, true]); + cases.push(['Kaschk Berli', { subject: 'Kaschk Berli' }, true]); cases.push(['Kaschk Berlin', { subject: 'Kaschk' }, true]); cases.push(['A', { subject: 'A' }, true]); @@ -308,7 +331,7 @@ module.exports.tests.text_parser = function (test, common) { // cases.push(['Air & Space Museu', { subject: 'Air & Space Museu' }, true]); // jitter issue cases.push(['Air & Space Museum', { subject: 'Air & Space Museum' }, true]); cases.push(['Air & Space Museum ', { subject: 'Air & Space Museum' }, true]); - cases.push(['Air & Space Museum D', { subject: 'Air & Space Museum' }, true]); + cases.push(['Air & Space Museum D', { subject: 'Air & Space Museum D' }, true]); cases.push(['Air & Space Museum DC', { subject: 'Air & Space Museum' }, true]); // admin areas @@ -322,7 +345,7 @@ module.exports.tests.text_parser = function (test, common) { cases.push(['New York', { subject: 'New York' }, true]); cases.push(['New York N', { subject: 'New York' }, true]); cases.push(['New York NY', { subject: 'New York' }, true]); - + cases.push(['B', { subject: 'B' }, true]); cases.push(['Be', { subject: 'Be' }, true]); cases.push(['Ber', { subject: 'Ber' }, true]); @@ -421,9 +444,9 @@ module.exports.tests.text_parser = function (test, common) { const raw = { text: ` Sometimes we make the process more complicated than we need to. -We will never make a journey of a thousand miles by fretting about +We will never make a journey of a thousand miles by fretting about how long it will take or how hard it will be. -We make the journey by taking each day step by step and then repeating +We make the journey by taking each day step by step and then repeating it again and again until we reach our destination.` }; const clean = {}; const messages = sanitizer.sanitize(raw, clean); From 1084bdf54dcc821d1870a5c4ac7be782e001cd1d Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Sep 2020 13:40:45 +0200 Subject: [PATCH 2/5] test(parser): add test case for Marks & Spencer --- test/unit/sanitizer/_text_pelias_parser.js | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index 8e17ac11c..e6887e5ca 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -365,6 +365,46 @@ module.exports.tests.text_parser = function (test, common) { cases.push(['Berlin Deutschlan', { subject: 'Berlin' }, true]); cases.push(['Berlin Deutschland', { subject: 'Berlin' }, true]); + // venue name with ampersand + // note: this query is ambigious as it could refer to either the + // UK high street brand "Marks & Spencer" or an intersection of + // Marks St and Spencer Ave. + // note: what we're looking for here is that we are using the whole + // input as the $subject regardless and without jitter. + cases.push(['M', { subject: 'M' }, true]); + cases.push(['Ma', { subject: 'Ma' }, true]); + cases.push(['Mar', { subject: 'Mar' }, true]); + cases.push(['Mark', { subject: 'Mark' }, true]); + + // note: for the following 5 keystrokes the $subject is simplified to 'Marks' + cases.push(['Marks', { subject: 'Marks' }, true]); + cases.push(['Marks ', { subject: 'Marks' }, true]); + cases.push(['Marks &', { subject: 'Marks' }, true]); + cases.push(['Marks & ', { subject: 'Marks' }, true]); + cases.push(['Marks & S', { subject: 'Marks' }, true]); + + cases.push(['Marks & Sp', { subject: 'Marks & Sp' }, true]); + cases.push(['Marks & Spe', { subject: 'Marks & Spe' }, true]); + cases.push(['Marks & Spen', { subject: 'Marks & Spen' }, true]); + cases.push(['Marks & Spenc', { subject: 'Marks & Spenc' }, true]); + cases.push(['Marks & Spence', { subject: 'Marks & Spence' }, true]); + cases.push(['Marks & Spencer', { subject: 'Marks & Spencer' }, true]); + + // venue is also known colloquially as "M AND S" + cases.push(['M', { subject: 'M' }, true]); + cases.push(['M ', { subject: 'M' }, true]); + cases.push(['M &', { subject: 'M &' }, true]); + cases.push(['M & ', { subject: 'M &' }, true]); + cases.push(['M & S', { subject: 'M & S' }, true]); + + cases.push(['M', { subject: 'M' }, true]); + cases.push(['M ', { subject: 'M' }, true]); + cases.push(['M a', { subject: 'M a' }, true]); + cases.push(['M an', { subject: 'M an' }, true]); + cases.push(['M and', { subject: 'M and' }, true]); + cases.push(['M and ', { subject: 'M and' }, true]); + cases.push(['M and S', { subject: 'M & S' }, true]); + // postcodes cases.push(['2000', { subject: '2000' }, true]); cases.push(['Sydney 2000', { subject: '2000' }, true]); From 24b87d3e757b09546517361362911ed3ae0c4cd6 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Sep 2020 13:43:07 +0200 Subject: [PATCH 3/5] package: pin dev dependency --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 73f60981b..f30c12ad3 100644 --- a/package.json +++ b/package.json @@ -54,7 +54,7 @@ "pelias-logger": "^1.2.0", "pelias-microservice-wrapper": "^1.7.0", "pelias-model": "^7.0.0", - "pelias-parser": "1.53.0", + "pelias-parser": "github:pelias/parser#venue_improvements", "pelias-query": "^11.0.0", "pelias-sorting": "^1.2.0", "predicates": "^2.0.0", From 756dd85fbd0860ef1372ce4ddbb27d53dc2c8a19 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 1 Sep 2020 13:59:00 +0200 Subject: [PATCH 4/5] refactor(parser): document MIN_PREFIX_CHAR_LENGTH --- sanitizer/_text_pelias_parser.js | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index 4aed45c03..f06748784 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -12,6 +12,15 @@ const MAX_TEXT_LENGTH = 140; // will simply have their entire body returned as the $subject const MIN_ACCEPTABLE_SCORE = 0.3; +// this constant defines the minimum amount of characters that can be +// interpreted as a $prefix when assigning them to $subject. +// this is useful for cases such as 'St Francis' where the parser returns +// { locality: 'Francis' }, it's not able to classify the 'St' token. +// in this case the leading 'St' would normally be considered the $subject +// and the parsed_text would be { subject: 'St', admin: 'Francis' }. +// the $MIN_PREFIX_CHAR_LENGTH var is a simple way of working around this problem. +const MIN_PREFIX_CHAR_LENGTH = 3; + /** this module provides fulltext parsing using the pelias/parser module. see: https://github.com/pelias/parser @@ -213,7 +222,7 @@ function parse (clean) { } // query with a $prefix such as a venue query else if (!_.isEmpty(prefix)){ - if (prefix.length > 2) { + if (prefix.length >= MIN_PREFIX_CHAR_LENGTH) { parsed_text.subject = prefix; } else { parsed_text = { subject: normalizedBody }; From 639b3a9a76b46d1bde1721114f2b9117a497c0b7 Mon Sep 17 00:00:00 2001 From: missinglink Date: Thu, 3 Sep 2020 17:06:37 +0200 Subject: [PATCH 5/5] refactor(parser): improved removal of unit numbers from $subject and $admin --- sanitizer/_text_pelias_parser.js | 5 +++-- test/unit/sanitizer/_text_pelias_parser.js | 10 ++++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sanitizer/_text_pelias_parser.js b/sanitizer/_text_pelias_parser.js index f06748784..0b64852e6 100644 --- a/sanitizer/_text_pelias_parser.js +++ b/sanitizer/_text_pelias_parser.js @@ -137,9 +137,10 @@ function parse (clean) { // ' VVVV NN SSSSSSS AAAAAA PPPPP ' let mask = solution.mask(t); - // the entire input text as seen by the parser with any postcode classification(s) removed + // the entire input text as seen by the parser with any postcode and unit + // classification(s) removed let body = t.span.body.split('') - .map((c, i) => (mask[i] !== 'P') ? c : ' ') + .map((c, i) => !/[PU]/.test(mask[i]) ? c : ' ') .join(''); // same as $body above but with consecutive whitespace squashed and trimmed. diff --git a/test/unit/sanitizer/_text_pelias_parser.js b/test/unit/sanitizer/_text_pelias_parser.js index e6887e5ca..68b627651 100644 --- a/test/unit/sanitizer/_text_pelias_parser.js +++ b/test/unit/sanitizer/_text_pelias_parser.js @@ -419,6 +419,16 @@ module.exports.tests.text_parser = function (test, common) { cases.push(['e8 1dn', { subject: 'e8 1dn' }, true]); // cases.push(['london e8 1dn', { subject: 'e8 1dn' }, true]); // issue + // unit number between address and locality + cases.push(['7750 Kennedy Rd #2A Markham', { + subject: '7750 Kennedy Rd', + housenumber: '7750', + street: 'Kennedy Rd', + unit: '#2A', + locality: 'Markham', + admin: 'Markham', + }]); + cases.forEach(testcase => { let input = testcase[0]; let expected = testcase[1];