本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:
使用方法: 创建 HtmlToText 实例并调用其 Convert 方法,例如:string text = new HtmlToText().Convert(html);
C#代码如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Block-level tags are translated to line breaks, the content of
/// script/noscript/style/object elements is discarded, runs of whitespace
/// are collapsed (except inside &lt;pre&gt;), and HTML entities are decoded.
/// <see cref="Convert"/> stores its parsing state in instance fields, so a
/// single instance must not run two conversions at the same time.
/// </remarks>
class HtmlToText
{
    // Static data tables
    protected static Dictionary<string, string> _tags;
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        // Maps a (lower-case) tag name to the text emitted when that tag
        // is encountered. Closing tags are keyed with a leading '/'.
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        // Elements whose entire inner content is dropped from the output
        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted</param>
    /// <returns>
    /// Resulting plain text; an empty string when <paramref name="html"/>
    /// is null.
    /// </returns>
    public string Convert(string html)
    {
        // Nothing to convert (previously this crashed with a
        // NullReferenceException)
        if (html == null)
            return String.Empty;

        // Initialize state variables
        _text = new TextBuilder();
        _html = html;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);

                if (_ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last so that an encoded '<' (&lt;) is never
        // mistaken for the start of a tag.
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    // Eats all characters that are part of the current tag
    // and returns information about that tag. The returned name is
    // lower-case and keeps the leading '/' of a closing tag. An HTML
    // comment is consumed as a whole and reported as "!--".
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // A comment may legally contain '>' characters, so it has to
            // be skipped up to its "-->" terminator instead of the first
            // '>'. Searching from the '!' + 1 also accepts the degenerate
            // forms "<!-->" and "<!--->".
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                int end = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
                _pos = (end < 0) ? _html.Length : end + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
                MoveAhead();
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
                MoveAhead();

            // Invariant casing: tag names are identifiers, and a
            // culture-sensitive ToLower() breaks the table lookups under
            // e.g. Turkish cultures ("DIV" would not become "div").
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                    EatQuotedValue();
                else
                {
                    if (Peek() == '/')
                        selfClosing = true;
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }

    // Consumes inner content from the current tag, up to and including
    // the matching end tag (or the end of the input if there is none).
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (tag == "script" || tag == "style")
        {
            // Raw-text elements: their content is not markup, so a '<'
            // inside it (e.g. "a < b" in a script) must not be parsed as
            // a tag. Jump straight to the first real end tag.
            while (true)
            {
                int end = _html.IndexOf("<" + endTag, _pos, StringComparison.OrdinalIgnoreCase);
                if (end < 0)
                {
                    _pos = _html.Length;
                    return;
                }

                _pos = end;
                bool ignored;
                // ParseTag always advances past the '<', so a near miss
                // such as "</scriptx>" cannot cause an endless loop.
                if (ParseTag(out ignored) == endTag)
                    return;
            }
        }

        // Other ignored elements: track nesting of the same element so
        // the outermost end tag terminates the skip. Unrelated nested
        // tags are simply consumed.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else MoveAhead();
        }
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 when positioned at the end of the input)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
            MoveAhead();
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
                break;
        }
    }

    // Moves the current position past a quoted value
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value. A line break also terminates the value
            // so that an unbalanced quote cannot swallow the whole input.
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead();    // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
                Write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                        _currLine.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // After trimming, a line holding only whitespace is empty
            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at
                // the very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}
希望本文所述对大家的C#程序设计有所帮助。
本文地址:https://www.stayed.cn/item/26322
转载请注明出处。
本站部分内容来源于网络,如侵犯到您的权益,请联系我们。