I recently had a situation where I needed to show some text received in HTML format as plain text. This is the method I now use for this purpose, implemented as an extension method:
using System.Linq;
using System.Text.RegularExpressions;
namespace ExtensionMethods
{
/// <summary>
/// String extension methods for reducing HTML markup to plain text.
/// </summary>
public static class StringExtensionMethods
{
    // Matches an opening or closing tag, allowing '>' to appear inside
    // single- or double-quoted attribute values. Built once and reused.
    private static readonly Regex TagRegex = new Regex(
        @"(?></?\w+)(?>(?:[^>'""]+|'[^']*'|""[^""]*"")*)>",
        RegexOptions.Compiled);

    // Matches an HTML comment (<!-- ... -->). The inner alternation is kept
    // unambiguous ([^\-] already covers CR/LF) so that an unterminated
    // comment cannot trigger exponential backtracking.
    private static readonly Regex CommentRegex = new Regex(
        @"\<![ \r\n\t]*(--([^\-]|-[^\-])*--[ \r\n\t]*)\>",
        RegexOptions.Compiled);

    // Matches any run of two or more consecutive space characters.
    private static readonly Regex SpaceRunRegex = new Regex(" {2,}", RegexOptions.Compiled);

    /// <summary>
    /// Converts HTML-formatted text to plain text by removing comments,
    /// replacing tags with a space, normalising non-breaking spaces, trimming,
    /// and collapsing runs of spaces.
    /// </summary>
    /// <param name="text">The HTML text to clean. May be null or empty.</param>
    /// <returns>
    /// The text without tags or comments; <paramref name="text"/> itself when it
    /// is null or empty.
    /// </returns>
    public static string StripHtml(this string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        // Comments go first: a comment may contain tags, and stripping those
        // tags beforehand would leave the comment delimiters behind.
        text = CommentRegex.Replace(text, string.Empty);

        // Each tag becomes a space so that words separated only by markup
        // (e.g. "one<br/>two") do not run together.
        text = TagRegex.Replace(text, " ");

        // Non-breaking spaces, as the entity or the raw U+00A0 character,
        // become ordinary spaces so they are trimmed and collapsed like any other.
        text = text.Replace("&nbsp;", " ").Replace('\u00A0', ' ');

        return text.Trim().RemoveDoubleSpaces();
    }

    /// <summary>
    /// Collapses every run of two or more consecutive spaces into a single space.
    /// </summary>
    /// <param name="text">The text to condense. May be null or empty.</param>
    /// <returns>
    /// The condensed text; <paramref name="text"/> itself when it is null or empty.
    /// </returns>
    public static string RemoveDoubleSpaces(this string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return text;
        }

        // A single pass over the string; text without a double space is
        // returned unchanged.
        return SpaceRunRegex.Replace(text, " ");
    }
}
}
The method RemoveDoubleSpaces was also needed, since after replacing HTML elements with empty space it is possible to end up with multiple empty spaces where a single space would do. This is quite a useful method in its own right, hence separating it out.
If you find any inputs which trip this method up, please let me know.