generation/: Ignore files which are not Unicode

When attempting to determine a file's language from its hashbang, files that cannot be opened as Unicode should be ignored. Also merge the duplicated hashbang detection code into a single helper and add test coverage for split_by_language. Fixes coala#293
abhaygupta97 · Oct 13, 2018 · 5c26e67 · 5c26e67
1 parent 7bd5689
commit 5c26e67
Show file tree

Hide file tree

Showing 5 changed files with 135 additions and 24 deletions.
diff --git a/.nocover.yaml b/.nocover.yaml
@@ -45,4 +45,3 @@ nocover_regexes:
   - def is_glob_exp
   - def get_gitignore_glob
   - def parse_gitignore_line
-  - def get_language_from_hashbang
diff --git a/coala_quickstart/generation/Project.py b/coala_quickstart/generation/Project.py
@@ -5,10 +5,12 @@
 
 from coala_utils.string_processing.StringConverter import StringConverter
 from coala_utils.Extensions import exts
-from coala_quickstart.generation.Utilities import get_language_from_hashbang
+from coala_quickstart.generation.Utilities import (
+    get_hashbang,
+    get_language_from_hashbang,
+)
 from coala_quickstart.Constants import (
     ASK_TO_SELECT_LANG,
-    HASHBANG_REGEX,
     )
 
 
@@ -56,15 +58,18 @@ def language_percentage(file_paths):
             for lang in exts[ext]:
                 results[lang] += delta
 
-        elif os.path.exists(file_path):
-            with open(file_path, 'r') as data:
-                hashbang = data.readline()
-                if re.match(HASHBANG_REGEX, hashbang):
-                    language = get_language_from_hashbang(hashbang).lower()
-                    for ext in exts:
-                        for lang in exts[ext]:
-                            if language == lang.lower():
-                                results[lang.lower()] += delta
+            continue
+
+        hashbang = get_hashbang(file_path)
+
+        if not hashbang:
+            continue
+
+        language = get_language_from_hashbang(hashbang).lower()
+        for ext in exts:
+            for lang in exts[ext]:
+                if language == lang.lower():
+                    results[lang.lower()] += delta
 
     return results
 

diff --git a/coala_quickstart/generation/Utilities.py b/coala_quickstart/generation/Utilities.py
@@ -99,16 +99,21 @@ def split_by_language(project_files):
             for lang in exts[ext]:
                 lang_files[lang.lower()].add(file)
                 lang_files['all'].add(file)
-        else:  # pragma: nocover
-            with open(file, 'r') as data:
-                hashbang = data.readline()
-                if(re.match(HASHBANG_REGEX, hashbang)):
-                    language = get_language_from_hashbang(hashbang).lower()
-                    for ext in exts:
-                        for lang in exts[ext]:
-                            if language == lang.lower():
-                                lang_files[lang.lower()].add(file)
-                                lang_files['all'].add(file)
+
+            continue
+
+        hashbang = get_hashbang(file)
+
+        if not hashbang:
+            continue
+
+        language = get_language_from_hashbang(hashbang).lower()
+        for ext in exts:
+            for lang in exts[ext]:
+                if language == lang.lower():
+                    lang_files[lang.lower()].add(file)
+                    lang_files['all'].add(file)
+
     return lang_files
 
 
@@ -179,8 +184,29 @@ def search_for_orig(decorated, orig_name):
                 return found
 
 
+def get_hashbang(file_path):
+    if not os.path.exists(file_path):
+        return
+
+    try:
+        with open(file_path, 'r') as data:
+            hashbang = data.readline()
+    except UnicodeDecodeError:  # pragma nt: no cover
+        return
+
+    hashbang = hashbang.strip()
+    if not hashbang:
+        return
+
+    if not re.match(HASHBANG_REGEX, hashbang):
+        return
+
+    return hashbang
+
+
 def get_language_from_hashbang(hashbang):
-    if(re.match('(^#!(.*))', hashbang)):
+    assert hashbang
+    if hashbang:  # pragma: no branch
         hashbang_contents = hashbang.split(' ')
         try:
             # For eg: #!bin/bash python3

diff --git a/setup.cfg b/setup.cfg
@@ -90,6 +90,5 @@ exclude_lines =
   def is_glob_exp
   def get_gitignore_glob
   def parse_gitignore_line
-  def get_language_from_hashbang
 
 [coverage:force_end_of_section]
diff --git a/tests/generation/UtilitiesTest.py b/tests/generation/UtilitiesTest.py
@@ -3,12 +3,16 @@
 import types
 import unittest
 
+from tempfile import NamedTemporaryFile
+
 from tests.test_bears.AllKindsOfSettingsDependentBear import (
     AllKindsOfSettingsDependentBear)
 from coala_quickstart.generation.Utilities import (
     contained_in,
+    get_hashbang,
     get_default_args, get_all_args,
     search_for_orig, concatenate, peek,
+    split_by_language,
     get_language_from_hashbang)
 from coalib.results.SourcePosition import SourcePosition
 from coalib.results.SourceRange import SourceRange
@@ -73,11 +77,89 @@ def test_get_all_args(self):
                           'no_chars': 79,
                           'chars': False, 'dependency_results': {}})
 
+
+class TestHashBang(unittest.TestCase):
+
+    def test_missing_file(self):
+        self.assertIsNone(get_hashbang('does_not_exist'))
+
+    def test_with_bash(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('#!bin/bash\n')
+            temp_file.close()
+            self.assertEqual(get_hashbang(temp_file.name), '#!bin/bash')
+
+    def test_no_eol(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('#!bin/bash')
+            temp_file.close()
+            self.assertEqual(get_hashbang(temp_file.name), '#!bin/bash')
+
+    def test_with_slash(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('#!/bin/bash\n')
+            temp_file.close()
+            self.assertEqual(get_hashbang(temp_file.name), '#!/bin/bash')
+
+    def test_with_space(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('#!/bin/bash \n')
+            temp_file.close()
+            self.assertEqual(get_hashbang(temp_file.name), '#!/bin/bash')
+
+    def test_env(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('#!/bin/env bash\n')
+            temp_file.close()
+            self.assertEqual(get_hashbang(temp_file.name), '#!/bin/env bash')
+
+    def test_non_unicode_file(self):
+        with NamedTemporaryFile(mode='w+b', delete=False) as temp_file:
+            temp_file.write(b'\2000x80')
+            temp_file.close()
+            self.assertIsNone(get_hashbang(temp_file.name))
+
+    def test_empty_file(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('\n')
+            temp_file.close()
+            self.assertIsNone(get_hashbang(temp_file.name))
+
+    def test_no_bang(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('#bin/bash')
+            temp_file.close()
+            self.assertIsNone(get_hashbang(temp_file.name))
+
+    def test_no_hash(self):
+        with NamedTemporaryFile(mode='w+t', delete=False) as temp_file:
+            temp_file.write('!bin/bash')
+            temp_file.close()
+            self.assertIsNone(get_hashbang(temp_file.name))
+
     def test_get_language_from_hashbang(self):
         self.assertEqual(get_language_from_hashbang('#!/usr/bin/env python'),
                          'python')
         self.assertEqual(get_language_from_hashbang('#!bin/bash'),
                          'bash')
+        self.assertEqual(get_language_from_hashbang('#!/bin/bash'),
+                         'bash')
+
+    def test_split_by_language(self):
+        with NamedTemporaryFile(delete=False, suffix='.py') as temp_file1, \
+                NamedTemporaryFile(delete=False, suffix='.txt') as temp_file2, \
+                NamedTemporaryFile(delete=False, suffix='.txt') as temp_file3:
+            temp_file3.write(b'#!bin/python')
+            temp_file3.close()
+            langs = split_by_language(
+                [temp_file1.name, temp_file2.name, temp_file3.name])
+            self.assertCountEqual(
+                langs,
+                {
+                    'all': [temp_file1.name, temp_file3.name],
+                    'python': [temp_file1.name, temp_file3.name],
+                }
+            )
 
 
 class TestDataStructuresOperationsFunctions(unittest.TestCase):