How to remove commonly occurring English words from a string
20 Dec 2007 — I’m using this function to filter common words out of a search query.
/// <summary>
/// Lower-cases <paramref name="sourceStr"/>, splits it on single spaces and
/// removes every occurrence of a fixed list of common English words.
/// </summary>
/// <param name="sourceStr">The text to filter, e.g. a search query.</param>
/// <returns>
/// The remaining words, lower-cased and joined by single spaces; an empty
/// string when <paramref name="sourceStr"/> is null or contains only ignored words.
/// </returns>
protected string removeCommonWords(string sourceStr)
{
    if (sourceStr == null)
    {
        return string.Empty;
    }

    string[] separator = { " " };
    string[] ignoreWords =
    {
        "a", "all", "am", "an", "and", "any", "are", "as", "at", "be", "but", "can",
        "did", "do", "does", "for", "from", "had", "has", "have", "here", "how", "i",
        "if", "in", "is", "it", "no", "not", "of", "on", "or", "so", "that", "the",
        "then", "there", "this", "to", "too", "up", "use", "what", "when", "where",
        "who", "why", "you"
    };

    // Invariant lower-casing so matching against the (ASCII) ignore list does not
    // depend on the current culture (e.g. Turkish maps "I" to a dotless "ı").
    string[] words = sourceStr.ToLowerInvariant()
                              .Split(separator, StringSplitOptions.RemoveEmptyEntries);

    // Compact the array in place: every word that is not in the ignore list is
    // moved down to the next free slot. Checking each word (rather than searching
    // once per ignore word) removes ALL occurrences, not only the first one.
    int kept = 0;
    for (int i = 0; i < words.Length; i++)
    {
        if (Array.IndexOf(ignoreWords, words[i]) == -1)
        {
            words[kept] = words[i];
            kept++;
        }
    }

    // Join only the first 'kept' entries; the tail holds stale leftovers.
    return string.Join(" ", words, 0, kept);
}
Let me know if you guys have a better solution.