I recently had a situation where I needed to show some text received in HTML format as plain text. This is the method I now use for this purpose, implemented as an extension method:
using System.Linq;
using System.Text.RegularExpressions;

namespace ExtensionMethods
{
    /// <summary>
    /// String extension methods for turning HTML fragments into plain text.
    /// </summary>
    public static class StringExtensionMethods
    {
        // Matches an opening or closing tag, tolerating '>' inside quoted attribute values.
        // Atomic groups keep the pattern from backtracking on malformed input.
        private static readonly Regex TagRegex = new Regex(
            @"(?></?\w+)(?>(?:[^>'""]+|'[^']*'|""[^""]*"")*)>",
            RegexOptions.Compiled);

        // Matches an HTML comment (<!-- ... -->). The alternatives inside the body are
        // mutually exclusive (a non-dash character, or a dash followed by a non-dash),
        // so an unterminated comment cannot trigger exponential backtracking.
        private static readonly Regex CommentRegex = new Regex(
            @"\<![ \r\n\t]*(?:--(?:[^\-]|-[^\-])*--[ \r\n\t]*)\>",
            RegexOptions.Compiled);

        // Matches any run of two or more consecutive space characters.
        private static readonly Regex SpaceRunRegex = new Regex(" {2,}", RegexOptions.Compiled);

        /// <summary>
        /// Removes HTML tags and comments from <paramref name="text"/> and normalizes
        /// the remaining spacing so the result can be displayed as plain text.
        /// </summary>
        /// <param name="text">The HTML text to strip. May be null or empty.</param>
        /// <returns>
        /// The input unchanged when it is null or empty; otherwise the text with every
        /// comment removed, every tag replaced by a space, non-breaking spaces converted
        /// to ordinary spaces, surrounding whitespace trimmed, and runs of spaces
        /// condensed to a single space.
        /// </returns>
        public static string StripHtml(this string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }

            // Comments go first: a comment may itself contain tags, and stripping the
            // tags beforehand would leave the comment delimiters and body behind.
            text = CommentRegex.Replace(text, string.Empty);

            // Each tag becomes a space so that words separated only by markup
            // (e.g. "one<br/>two") do not run together.
            text = TagRegex.Replace(text, " ");

            // Non-breaking spaces, whether still encoded as an entity or already
            // decoded to U+00A0, are treated as ordinary spaces.
            text = text.Replace("&nbsp;", " ").Replace('\u00A0', ' ');

            return text.Trim().RemoveDoubleSpaces();
        }

        /// <summary>
        /// Condenses every run of consecutive spaces in <paramref name="text"/> to a
        /// single space.
        /// </summary>
        /// <param name="text">The text to condense. May be null or empty.</param>
        /// <returns>
        /// The input unchanged when it is null or empty; otherwise the text with no two
        /// adjacent space characters.
        /// </returns>
        public static string RemoveDoubleSpaces(this string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }

            // A single pass over the text; replaces the repeated scan-and-replace loop.
            return SpaceRunRegex.Replace(text, " ");
        }
    }
}
The method RemoveDoubleSpaces was also needed because, after each HTML element is replaced with a space, the text can end up with runs of consecutive spaces where a single space would do. It is a useful method in its own right, which is why it is separated out.
If you find any inputs which trip this method up, please let me know.