Convert HTML to Plain Text
,
Strip out HTML tags while preserving the basic formatting
Introduction
This article provides the procedure for stripping out HTML tags while preserving most basic formatting. In other words, it converts HTML to plain text.Background
This example heavily relies on regular expressions, in particular System.Text.RegularExpressions.Regex.Replace()method. You may also find this reference on regular expressions syntax useful.Using the Code
The code usesSystem.Text.RegularExpressions namespace
and consists of a single function,StripHTML()
.First, the development formatting is removed such as tabs used for step-identations and repeated whitespaces. As a result, the input HTML is "flattened" into one continuous string. This serves two reasons:
- To remove the formatting ignored by browsers
- To make the regexes work reliably (they seem to get confused by escape characters)
<head>
and </head>
tags.Then, all scripts are removed by chopping out anything between
<script>
and </script>
tags inclusive. Similarly with styles.Then the basic formatting tags, such as
<BR>
and <DIV>
are replaced with \r
or \r\r
. Also <TR>
tags are replaced by line breaks and <TD>
s by tabs.<LI>
s are replaced by *s
and special characters such as
are replaced with their corresponding values.Finally all the remaining tags are replaced with empty strings.
By this stage, there are likely to be a lot of redundant repeating line breaks and tabs. Any sequence over 2 line breaks long is replaced by two line breaks. Similarly with tabs: sequences over 4 tabs long are replaced by 4 tabs.
Hide Shrink Copy Code
private string StripHTML(string source)
{
try
{
string result;
// Remove HTML Development formatting
// Replace line breaks with space
// because browsers inserts space
result = source.Replace("\r", " ");
// Replace line breaks with space
// because browsers inserts space
result = result.Replace("\n", " ");
// Remove step-formatting
result = result.Replace("\t", string.Empty);
// Remove repeating spaces because browsers ignore them
result = System.Text.RegularExpressions.Regex.Replace(result,
@"( )+", " ");
// Remove the header (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*head([^>])*>","<head>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*head( )*>)","</head>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<head>).*(</head>)",string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all scripts (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*script([^>])*>","<script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*script( )*>)","</script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
//result = System.Text.RegularExpressions.Regex.Replace(result,
// @"(<script>)([^(<script>\.</script>)])*(</script>)",
// string.Empty,
// System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<script>).*(</script>)",string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all styles (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*style([^>])*>","<style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*style( )*>)","</style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<style>).*(</style>)",string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert tabs in spaces of <td> tags
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*td([^>])*>","\t",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line breaks in places of <BR> and <LI> tags
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*br( )*>","\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*li( )*>","\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line paragraphs (double line breaks) in place
// if <P>, <DIV> and <TR> tags
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*div([^>])*>","\r\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*tr([^>])*>","\r\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*p([^>])*>","\r\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove remaining tags like <a>, links, images,
// comments etc - anything that's enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<[^>]*>",string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// replace special characters:
result = System.Text.RegularExpressions.Regex.Replace(result,
@" "," ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"•"," * ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"‹","<",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"›",">",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"™","(tm)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"⁄","/",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<","<",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@">",">",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"©","(c)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"®","(r)",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all others. More can be added, see
// http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result,
@"&(.{2,6});", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// for testing
//System.Text.RegularExpressions.Regex.Replace(result,
// this.txtRegex.Text,string.Empty,
// System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// make line breaking consistent
result = result.Replace("\n", "\r");
// Remove extra line breaks and tabs:
// replace over 2 breaks with 2 and over 4 tabs with 4.
// Prepare first to remove any whitespaces in between
// the escaped characters and remove redundant tabs in between line breaks
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)( )+(\r)","\r\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\t)( )+(\t)","\t\t",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\t)( )+(\r)","\t\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)( )+(\t)","\r\t",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)(\t)+(\r)","\r\r",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove multiple tabs following a line break with just one tab
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)(\t)+","\r\t",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Initial replacement target string for line breaks
string breaks = "\r\r\r";
// Initial replacement target string for tabs
string tabs = "\t\t\t\t\t";
for (int index=0; index<result.Length; index++)
{
result = result.Replace(breaks, "\r\r");
result = result.Replace(tabs, "\t\t\t\t");
breaks = breaks + "\r";
tabs = tabs + "\t";
}
// That's it.
return result;
}
catch
{
MessageBox.Show("Error");
return source;
}
}
Points of Interest
Escape characters such as\n
and \r
had to be removed first because they cause regexes to cease working as expected.Moreover, to make the result string display correctly in the textbox, one might need to split it up and set
textbox
's Lines
property instead of assigning to Text
property.
Hide Copy Code
this.txtResult.Lines =
StripHTML(this.txtSource.Text).Split("\r".ToCharArray());
No comments:
Post a Comment