Skip to content

Commit

Permalink
Fixes processing bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
syfantid committed Jun 1, 2016
1 parent 9cc15e1 commit 4e7719a
Showing 1 changed file with 14 additions and 19 deletions.
33 changes: 14 additions & 19 deletions src/combiner/Preprocessor.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,15 @@ public static void preprocessTweets(String collectionName, FrequencyCounter fc)
* @return The parsed tweet text
*/
private static String preprocessTweet(String input){
String[] temp = tokenizer(input);
String output = "";
for(String s : temp){
if(isWhitelisted(s) && !isMention(s, '@')){
output = output + " ";
output = output + s;
input = prepareText(input);
String[] tokens = tokenizer(input);
StringBuilder builder = new StringBuilder("");
for(String s : tokens){
if(isWhitelisted(s) && !isMention(s, '@')){ //if the word is whitelisted and its not a 'Google Plus' mention
builder.append(" ").append(s);
}
}
return finalizeText(output);
return input;
}

/**
Expand All @@ -81,14 +81,15 @@ private static String preprocessTweet(String input){
* @return The parsed comment text
*/
private static String preprocessComment(String input){
    // Normalise first: prepareText lowercases, blanks out non-word characters and
    // single-letter words, then collapses whitespace.
    // NOTE(review): removePunctuation replaces every \W character, so a leading '+'
    // has presumably been stripped before isMention sees the token - confirm whether
    // mentions should be filtered before normalisation instead.
    String prepared = prepareText(input);

    // Keep only whitelisted tokens that are not Google Plus mentions, separated by a single space.
    StringBuilder builder = new StringBuilder();
    for (String token : tokenizer(prepared)) {
        if (isWhitelisted(token) && !isMention(token, '+')) {
            if (builder.length() > 0) {
                builder.append(' ');
            }
            builder.append(token);
        }
    }

    // Return the filtered text; returning the prepared input would silently
    // discard the whitelist/mention filtering done above.
    return builder.toString();
}

/**
Expand Down Expand Up @@ -167,11 +168,7 @@ private static String toLowerCase(String input){
* @return The string without punctuation
*/
private static String removePunctuation(String input){
    // Each character outside [a-zA-Z0-9_] (Java's default \W class) becomes one space.
    // Runs of spaces are intentionally left alone here; prepareText collapses and
    // trims whitespace after all removals have been applied.
    return input.replaceAll("\\W", " ");
}

/**
Expand All @@ -180,22 +177,20 @@ private static String removePunctuation(String input){
* @return The string without single-character words
*/
private static String removeSingleCharacter(String input){
    // A lone lowercase letter delimited by word boundaries is replaced with a space.
    // Uppercase letters and digits are not matched, so callers are expected to
    // lowercase first (prepareText does). Whitespace is collapsed later by prepareText.
    return input.replaceAll("\\b[a-z]\\b", " ");
}

/**
* Removes useless characters like punctuation and single characters and transforms to lowercase
* @param input The string to be checked
* @return The string finalized
*/
private static String finalizeText(String input){
private static String prepareText(String input){
input = toLowerCase(input);
input = removePunctuation(input);
input = removeSingleCharacter(input);
input = input.replaceAll("\\s+"," ");
input = input.trim();

return input;
}
Expand Down

0 comments on commit 4e7719a

Please sign in to comment.