+ uses unicode's likely subtags to find the best country flag matching a given language ; fixes #8 + stripping strings in a few places to remove possible empty new lines + logging a few exceptions just in case
nicolabs · May 15, 2020 · c409679 · c409679
1 parent f4464aa
commit c409679
Show file tree

Hide file tree

Showing 2 changed files with 1,943 additions and 7 deletions.
diff --git a/nicobot/transbot.py b/nicobot/transbot.py
@@ -19,6 +19,7 @@
 # Provides an easy way to get the unicode sequence for country flags
 import flag
 import yaml
+import urllib.request
 
 # Own classes
 from helpers import *
@@ -37,7 +38,14 @@
 # Set to something > 0 to limit the number of translations for the keywords (for tests)
 LIMIT_KEYWORDS = None
 
-# Default (empty actually) configuration, to ease depth navigation
+# See https://github.com/nicolabs/nicobot/issues/8
+# Description : https://unicode.org/reports/tr35/#Likely_Subtags
+# Original XML version : http://cldr.unicode.org/index/cldr-spec/language-tag-equivalences
+# This is the URL to the JSON version
+LIKELY_SUBTAGS_URL = "https://raw.githubusercontent.com/unicode-cldr/cldr-core/master/supplemental/likelySubtags.json"
+
+
+# Default configuration (some defaults still need to be set up after command line has been parsed)
 class Config:
 
     def __init__(self):
@@ -53,6 +61,7 @@ def __init__(self):
             'keywords_files': [],
             'languages': [],
             'languages_file': None,
+            'languages_likely': None,
             # e.g. locale.getlocale() may return ('en_US','UTF-8') : we only keep the 'en_US' part here (the same as the expected command-line parameter)
             'locale': locale.getlocale()[0],
             'recipient': None,
@@ -85,13 +94,15 @@ class TransBot(Bot):
     def __init__( self,
         chatter, ibmcloud_url, ibmcloud_apikey,
         keywords=[], keywords_files=[],
-        languages=[], languages_file=None, locale=re.split(r'[_-]',locale.getlocale()[0]),
+        languages=[], languages_file=None, languages_likely=None,
+        locale=re.split(r'[_-]',locale.getlocale()[0]),
         shutdown_pattern=r'bye nicobot' ):
         """
             keywords: list of keywords that will trigger this bot (in any supported language)
             keywords_files: list of JSON files with each a list of keywords (or write into)
             languages: List of supported languages in this format : https://cloud.ibm.com/apidocs/language-translator#list-identifiable-languages
             languages_file: JSON file where to find the list of target languages (or write into)
+            languages_likely: JSON URI where to find Unicode's likely subtags (or write into)
             locale: overrides the default locale ; tuple like : ('en','GB')
             shutdown_pattern: a regular expression pattern that terminates this bot
             chatter: the backend chat engine
@@ -112,6 +123,8 @@ def __init__( self,
         # How many different languages to try to translate to
         self.tries = 5
 
+        self.likelyLanguages = self.loadLikelyLanguages(languages_likely)
+
         # After self.languages has been set, we can iterate over it to translate keywords
         kws = self.loadKeywords( keywords=keywords, files=keywords_files, limit=LIMIT_KEYWORDS )
         # And build a regular expression pattern with all keywords and their translations
@@ -172,7 +185,7 @@ def loadLanguages( self, force=False, file=None, locale='en' ):
                 # so the output list will always be the same size as the input one
                 t = 0
                 for language in languages:
-                    language['name'] = translations['translations'][t]['translation']
+                    language['name'] = translations['translations'][t]['translation'].strip()
                     t = t + 1
 
             # Save it for the next time
@@ -230,7 +243,7 @@ def loadKeywords( self, keywords=[], files=[], limit=None ):
                     translation = self.translate( [keyword], target=lang['language'] )
                     if translation:
                         for t in translation['translations']:
-                            translated = t['translation'].rstrip()
+                            translated = t['translation'].strip()
                             logging.debug("Adding translation %s in %s for %s", t, lang, keyword)
                             kws = kws + [ translated ]
                 except:
@@ -253,6 +266,31 @@ def loadKeywords( self, keywords=[], files=[], limit=None ):
         return kws
 
 
+    def loadLikelyLanguages( self, file ):
+        """
+            Returns a dict parsed from a Likely Subtags JSON structure.
+
+            file : path to a local JSON copy of the likely subtags ; may be None or empty
+
+            If the file cannot be read or parsed (or if no file was given), the data is
+            downloaded from LIKELY_SUBTAGS_URL and, when a filename was given, saved into it
+            for the next time.
+            Failing to save is only logged ; failing to download or to parse the downloaded
+            data raises the underlying exception.
+        """
+
+        if file:
+            try:
+                logging.debug("Loading likely languages from %s",file)
+                # JSON is UTF-8 by specification : don't depend on the locale's encoding
+                with open(file,'r',encoding='utf-8') as f:
+                    return json.load(f)
+            except (OSError, ValueError):
+                # Missing/unreadable file (OSError) or invalid content (JSONDecodeError and
+                # UnicodeDecodeError are both ValueError) : falls back to downloading
+                logging.debug("Could not load likely languages from %s",file,exc_info=True)
+
+        # Downloading outside of the 'except' block so that a network error is not reported
+        # as chained to the (unrelated) file loading error
+        logging.debug("Downloading likely subtags from %s",LIKELY_SUBTAGS_URL)
+        with urllib.request.urlopen(LIKELY_SUBTAGS_URL) as response:
+            likelySubtags = response.read()
+        logging.log(TRACE,"Got likely subtags : %s",repr(likelySubtags))
+
+        # Saves it for the next time (only if we have somewhere to save it)
+        if file:
+            try:
+                logging.debug("Saving likely subtags into %s",file)
+                # Writes the raw bytes as-is to avoid a decoding/encoding round trip
+                with open(file,'wb') as f:
+                    f.write(likelySubtags)
+            except OSError:
+                logging.exception("Error saving the likely languages into %s",repr(file))
+
+        return json.loads(likelySubtags)
+
+
     def translate( self, messages, target, source=None ):
         """
             Translates a given list of messages.
@@ -289,6 +327,28 @@ def translate( self, messages, target, source=None ):
             r.raise_for_status()
 
 
+    def languageToCountry( self, lang ):
+        """
+            Returns the most likely ISO 3166 country code from an (~ISO 639 or IBM-custom) language
+            or the given 'lang' if no country code could be identified.
+
+            lang : the language returned by IBM Translator service (is it ISO 639 ?)
+
+            The lookup is done in self.likelyLanguages, under ['supplemental']['likelySubtags'] ;
+            the last subtag of the matching value is returned.
+            Never raises : any lookup error is logged as a warning and 'lang' is returned as is.
+
+            See https://github.com/nicolabs/nicobot/issues/8
+            Likely subtags explanation and format :
+            - https://unicode.org/reports/tr35/#Likely_Subtags
+            - http://cldr.unicode.org/index/cldr-spec/language-tag-equivalences
+        """
+        try:
+            aa_Bbbb_CC = self.likelyLanguages['supplemental']['likelySubtags'][lang]
+            logging.log(TRACE,"Found likely subtags %s for language %s",aa_Bbbb_CC,lang)
+            # The last part is the ISO 3166 country code
+            return re.split( r'[_-]', aa_Bbbb_CC )[-1]
+        except Exception:
+            # Best effort : 'Exception' rather than a bare 'except' so that KeyboardInterrupt
+            # and SystemExit are not swallowed
+            logging.warning("Could not find a country code for %s : returning itself",lang, exc_info=True)
+            return lang
+
+
     def formatTranslation( self, translation, target ):
         """
             Common decoration of translated messages
@@ -297,11 +357,13 @@ def formatTranslation( self, translation, target ):
             target = reminder of which target language was asked (does not appear in the response of translate())
         """
 
-        text = translation['translations'][0]['translation']
+        text = translation['translations'][0]['translation'].strip()
         try:
             # Note : translation['detected_language'] is the detected source language, if guessed
-            lang_emoji = flag.flag(target)
+            country = self.languageToCountry(target)
+            lang_emoji = flag.flag(country)
         except ValueError:
+            logging.debug("Error looking for flag %s",target,exc_info=True)
             lang_emoji= "🏳️‍🌈"
         answer = "%s %s" % (text,lang_emoji)
         return i18n.t('all_messages',message=answer)
@@ -473,6 +535,7 @@ def run( self ):
     parser.add_argument("--keywords-file", dest="keywords_files", action="append", help="File to load from and write keywords to")
     parser.add_argument('--locale', '-l', dest='locale', default=config.locale, help="Change default locale (e.g. 'fr_FR')")
     parser.add_argument("--languages-file", dest="languages_file", help="File to load from and write languages to")
+    parser.add_argument("--languages-likely", dest="languages_likely", default=config.languages_likely, help="URI to Unicode's Likely Subtags (best language <-> country matches) in JSON format")
     parser.add_argument("--shutdown", dest="shutdown", help="Shutdown keyword regular expression pattern")
     parser.add_argument("--ibmcloud-url", dest="ibmcloud_url", help="IBM Cloud API base URL (get it from your resource https://cloud.ibm.com/resources)")
     parser.add_argument("--ibmcloud-apikey", dest="ibmcloud_apikey", help="IBM Cloud API key (get it from your resource : https://cloud.ibm.com/resources)")
@@ -597,6 +660,13 @@ def run( self ):
     if not config.languages_file:
         raise ValueError("Missing language file : please use only --languages-file to generate it automatically or --language for each target language")
 
+    # Finds a "likely language" file
+    config.languages_likely = filter_files([
+        config.languages_likely,
+        os.path.join( config.config_dir, 'likelySubtags.json' ) ],
+        should_exist=True,
+        fallback_to=1 )[0]
+
     # Creates the chat engine depending on the 'backend' parameter
     if config.backend == "signal":
         if not config.signal_cli:
@@ -624,7 +694,7 @@ def run( self ):
 
     TransBot(
         keywords=config.keywords, keywords_files=config.keywords_files,
-        languages_file=config.languages_file,
+        languages_file=config.languages_file, languages_likely=config.languages_likely,
         locale=lang,
         ibmcloud_url=config.ibmcloud_url, ibmcloud_apikey=config.ibmcloud_apikey,
         shutdown_pattern=config.shutdown,