gpt4 book ai didi

c# - 如何在 C# 中将 rtf 字符串转换为文本

转载 作者:可可西里 更新时间:2023-11-01 09:13:14 26 4
gpt4 key购买 nike

有没有一种简单的方法,可以在不使用 RichTextBox 的情况下从 RTF 字符串中提取文本?


{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par} 
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}




如何在不引用其他库的情况下用纯 C# 实现:

有人写了一个类,正如提问者(OP)所要求的那样,把 RTF 剥离为纯文本。源代码如下:


    /// <summary>
    /// Rich Text Stripper: extracts the plain text from an RTF string without
    /// using <c>RichTextBox</c> or any library outside the base class library.
    /// </summary>
    /// <remarks>
    /// Translated from a Python implementation (the link to it is missing from
    /// this listing). The input is tokenized with a single regular expression and
    /// the tokens are interpreted with a small state machine that tracks group
    /// nesting, "ignorable" destinations and the <c>\uc</c> fallback-skip count.
    /// </remarks>
    public static class RichTextStripper
    {
        /// <summary>
        /// State saved when a group opens (<c>{</c>) and restored when it closes (<c>}</c>).
        /// </summary>
        private class StackEntry
        {
            /// <summary>Value of the <c>\uc</c> skip count in effect when the group was opened.</summary>
            public int NumberOfCharactersToSkip { get; set; }

            /// <summary>Whether output was suppressed when the group was opened.</summary>
            public bool Ignorable { get; set; }

            public StackEntry(int numberOfCharactersToSkip, bool ignorable)
            {
                NumberOfCharactersToSkip = numberOfCharactersToSkip;
                Ignorable = ignorable;
            }
        }

        // Token alternatives, in order:
        //   1,2: control word and optional numeric argument (one trailing space is consumed)
        //   3:   \'xx hex escape
        //   4:   control symbol (backslash followed by a non-letter)
        //   5:   group brace
        //   -:   raw line breaks (no capture group, so they are dropped)
        //   6:   any other single character
        private static readonly Regex _rtfRegex = new Regex(@"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Control words that start a destination whose content is not document text.
        // NOTE(review): the entries of this collection were missing from the listing and
        // were restored from the destination control words of the RTF specification;
        // verify them against the upstream source before relying on exact parity.
        // A HashSet is used because membership is tested once per control word.
        private static readonly HashSet<string> destinations = new HashSet<string>
        {
            "aftncn", "aftnsep", "aftnsepc", "annotation", "atnauthor", "atndate", "atnicn", "atnid",
            "atnparent", "atnref", "atntime", "atrfend", "atrfstart", "author", "background",
            "bkmkend", "bkmkstart", "blipuid", "buptim", "category", "colorschememapping",
            "colortbl", "comment", "company", "creatim", "datafield", "datastore", "defchp", "defpap",
            "do", "doccomm", "docvar", "dptxbxtext", "ebcend", "ebcstart", "factoidname", "falt",
            "fchars", "ffdeftext", "ffentrymcr", "ffexitmcr", "ffformat", "ffhelptext", "ffl",
            "ffname", "ffstattext", "field", "file", "filetbl", "fldinst", "fldrslt", "fldtype",
            "fname", "fontemb", "fontfile", "fonttbl", "footer", "footerf", "footerl", "footerr",
            "footnote", "formfield", "ftncn", "ftnsep", "ftnsepc", "g", "generator", "gridtbl",
            "header", "headerf", "headerl", "headerr", "hl", "hlfr", "hlinkbase", "hlloc", "hlsrc",
            "hsv", "htmltag", "info", "keycode", "keywords", "latentstyles", "lchars", "levelnumbers",
            "leveltext", "lfolevel", "linkval", "list", "listlevel", "listname", "listoverride",
            "listoverridetable", "listpicture", "liststylename", "listtable", "listtext",
            "lsdlockedexcept", "macc", "maccPr", "mailmerge", "maln", "malnScr", "manager", "margPr",
            "mbar", "mbarPr", "mbaseJc", "mbegChr", "mborderBox", "mborderBoxPr", "mbox", "mboxPr",
            "mchr", "mcount", "mctrlPr", "md", "mdeg", "mdegHide", "mden", "mdiff", "mdPr", "me",
            "mendChr", "meqArr", "meqArrPr", "mf", "mfName", "mfPr", "mfunc", "mfuncPr", "mgroupChr",
            "mgroupChrPr", "mgrow", "mhideBot", "mhideLeft", "mhideRight", "mhideTop", "mhtmltag",
            "mlim", "mlimloc", "mlimlow", "mlimlowPr", "mlimupp", "mlimuppPr", "mm", "mmaddfieldname",
            "mmath", "mmathPict", "mmathPr", "mmaxdist", "mmc", "mmcJc", "mmconnectstr",
            "mmconnectstrdata", "mmcPr", "mmcs", "mmdatasource", "mmheadersource", "mmmailsubject",
            "mmodso", "mmodsofilter", "mmodsofldmpdata", "mmodsomappedname", "mmodsoname",
            "mmodsorecipdata", "mmodsosort", "mmodsosrc", "mmodsotable", "mmodsoudl",
            "mmodsoudldata", "mmodsouniquetag", "mmPr", "mmquery", "mmr", "mnary", "mnaryPr",
            "mnoBreak", "mnum", "mobjDist", "moMath", "moMathPara", "moMathParaPr", "mopEmu",
            "mphant", "mphantPr", "mplcHide", "mpos", "mr", "mrad", "mradPr", "mrPr", "msepChr",
            "mshow", "mshp", "msPre", "msPrePr", "msSub", "msSubPr", "msSubSup", "msSubSupPr", "msSup",
            "msSupPr", "mstrikeBLTR", "mstrikeH", "mstrikeTLBR", "mstrikeV", "msub", "msubHide",
            "msup", "msupHide", "mtransp", "mtype", "mvertJc", "mvfmf", "mvfml", "mvtof", "mvtol",
            "mzeroAsc", "mzeroDesc", "mzeroWid", "nesttableprops", "nextfile", "nonesttables",
            "objalias", "objclass", "objdata", "object", "objname", "objsect", "objtime", "oldcprops",
            "oldpprops", "oldsprops", "oldtprops", "oleclsid", "operator", "panose", "password",
            "passwordhash", "pgp", "pgptbl", "picprop", "pict", "pn", "pnseclvl", "pntext", "pntxta",
            "pntxtb", "printim", "private", "propname", "protend", "protstart", "protusertbl", "pxe",
            "result", "revtbl", "revtim", "rsidtbl", "rxe", "shp", "shpgrp", "shpinst",
            "shppict", "shprslt", "shptxt", "sn", "sp", "staticval", "stylesheet", "subject", "sv",
            "svb", "tc", "template", "themedata", "title", "txe", "ud", "upr", "userprops",
            "wgrffmtfilter", "windowcaption", "writereservation", "writereservhash", "xe", "xform",
            "xmlattrname", "xmlattrvalue", "xmlclose", "xmlname", "xmlnstbl",
            "xmlopen"
        };

        // Control words that translate directly into a literal piece of text.
        private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
        {
            { "par", "\n" },
            { "sect", "\n\n" },
            { "page", "\n\n" },
            { "line", "\n" },
            { "tab", "\t" },
            { "emdash", "\u2014" },
            { "endash", "\u2013" },
            { "emspace", "\u2003" },
            { "enspace", "\u2002" },
            { "qmspace", "\u2005" },
            { "bullet", "\u2022" },
            { "lquote", "\u2018" },
            { "rquote", "\u2019" },
            { "ldblquote", "\u201C" },
            { "rdblquote", "\u201D" },
        };

        /// <summary>
        /// Strip RTF Tags from RTF Text
        /// </summary>
        /// <param name="inputRtf">RTF formatted text</param>
        /// <returns>
        /// Plain text from RTF; <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>,
        /// and an empty string when it is empty.
        /// </returns>
        /// <remarks>
        /// Malformed input is tolerated instead of throwing: an unmatched closing brace is
        /// ignored, and a <c>\uc</c> or <c>\u</c> control word without a usable numeric
        /// argument is skipped.
        /// </remarks>
        public static string StripRichTextFormat(string inputRtf)
        {
            if (inputRtf == null)
            {
                return null;
            }

            var stack = new Stack<StackEntry>();
            bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
            int ucskip = 1;         // Number of ASCII characters to skip after a unicode character.
            int curskip = 0;        // Number of ASCII characters left to skip.
            var output = new System.Text.StringBuilder(inputRtf.Length);

            foreach (Match match in _rtfRegex.Matches(inputRtf))
            {
                string word = match.Groups[1].Value;
                string arg = match.Groups[2].Value;
                string hex = match.Groups[3].Value;
                string character = match.Groups[4].Value;
                string brace = match.Groups[5].Value;
                string tchar = match.Groups[6].Value;

                if (brace.Length > 0)
                {
                    curskip = 0;
                    if (brace == "{")
                    {
                        // Push state
                        stack.Push(new StackEntry(ucskip, ignorable));
                    }
                    else if (stack.Count > 0)
                    {
                        // Pop state. An unmatched "}" has nothing to restore and is ignored
                        // rather than letting Stack.Pop throw on malformed input.
                        StackEntry entry = stack.Pop();
                        ucskip = entry.NumberOfCharactersToSkip;
                        ignorable = entry.Ignorable;
                    }
                }
                else if (character.Length > 0) // \x (not a letter)
                {
                    curskip = 0;
                    if (character == "*")
                    {
                        // "\*" marks the enclosing group as an ignorable destination.
                        ignorable = true;
                    }
                    else if (!ignorable)
                    {
                        if (character == "~")
                        {
                            output.Append('\u00A0'); // non-breaking space
                        }
                        else if (character == "{" || character == "}" || character == "\\")
                        {
                            output.Append(character); // escaped literal
                        }
                    }
                }
                else if (word.Length > 0) // \foo
                {
                    curskip = 0;
                    string replacement;
                    int number;
                    if (destinations.Contains(word))
                    {
                        ignorable = true;
                    }
                    else if (ignorable)
                    {
                        // Control words inside an ignorable group have no effect.
                    }
                    else if (specialCharacters.TryGetValue(word, out replacement))
                    {
                        output.Append(replacement);
                    }
                    else if (word == "uc")
                    {
                        if (TryParseArgument(arg, out number))
                        {
                            ucskip = Math.Max(0, number);
                        }
                    }
                    else if (word == "u")
                    {
                        if (TryParseArgument(arg, out number))
                        {
                            AppendUnicodeValue(output, number);
                            curskip = ucskip;
                        }
                    }
                }
                else if (hex.Length > 0) // \'xx
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        // The byte is mapped straight to the code point of the same value.
                        // NOTE(review): this ignores the document code page (\ansicpgN), so
                        // bytes 0x80-0x9F of e.g. Windows-1252 text are not translated.
                        int code = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture);
                        output.Append((char)code);
                    }
                }
                else if (tchar.Length > 0)
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        output.Append(tchar);
                    }
                }
            }

            return output.ToString();
        }

        /// <summary>Parses the optional signed decimal argument of a control word without throwing.</summary>
        private static bool TryParseArgument(string arg, out int value)
        {
            return Int32.TryParse(
                arg,
                System.Globalization.NumberStyles.AllowLeadingSign,
                System.Globalization.CultureInfo.InvariantCulture,
                out value);
        }

        /// <summary>
        /// Appends the character denoted by a <c>\uN</c> argument. Negative arguments are
        /// shifted by 0x10000. Values up to 0xFFFF are appended as UTF-16 code units so that
        /// two consecutive <c>\u</c> words forming a surrogate pair yield one character;
        /// an unpaired surrogate is passed through unchanged. Larger valid code points are
        /// converted, and anything else becomes "?".
        /// </summary>
        private static void AppendUnicodeValue(System.Text.StringBuilder output, int value)
        {
            if (value < 0)
            {
                value += 0x10000;
            }

            if (value >= 0 && value <= 0xFFFF)
            {
                output.Append((char)value);
            }
            else if (value > 0xFFFF && value <= 0x10FFFF)
            {
                output.Append(Char.ConvertFromUtf32(value));
            }
            else
            {
                output.Append('?');
            }
        }
    }

编辑 1:在此期间,我们已经在测试环境中运行了这段代码,并在生产环境中使用了它的改编版本。新版本增加了一些额外的安全检查,并能更好地处理换行。

/// <summary>
/// Strip RTF Tags from RTF Text (revised version with additional safety checks).
/// </summary>
/// <param name="inputRtf">RTF formatted text</param>
/// <returns>
/// Plain text from RTF; <c>null</c> when <paramref name="inputRtf"/> is <c>null</c>,
/// and an empty string when it is empty.
/// </returns>
/// <remarks>
/// Uses the enclosing class's <c>_rtfRegex</c>, <c>destinations</c>,
/// <c>specialCharacters</c> and <c>StackEntry</c> members. Malformed input is tolerated
/// instead of throwing: an unmatched closing brace is ignored, and a <c>\uc</c> or
/// <c>\u</c> control word without a usable numeric argument is skipped.
/// </remarks>
public static string StripRichTextFormat(string inputRtf)
{
    if (inputRtf == null)
    {
        return null;
    }

    var stack = new Stack<StackEntry>();
    bool ignorable = false; // Whether this group (and all inside it) are "ignorable".
    int ucskip = 1;         // Number of ASCII characters to skip after a unicode character.
    int curskip = 0;        // Number of ASCII characters left to skip.
    var output = new System.Text.StringBuilder(inputRtf.Length);

    foreach (Match match in _rtfRegex.Matches(inputRtf))
    {
        string word = match.Groups[1].Value;
        string arg = match.Groups[2].Value;
        string hex = match.Groups[3].Value;
        string character = match.Groups[4].Value;
        string brace = match.Groups[5].Value;
        string tchar = match.Groups[6].Value;

        if (brace.Length > 0)
        {
            curskip = 0;
            if (brace == "{")
            {
                // Push state
                stack.Push(new StackEntry(ucskip, ignorable));
            }
            else if (stack.Count > 0)
            {
                // Pop state. An unmatched "}" has nothing to restore and is ignored
                // rather than letting Stack.Pop throw on malformed input.
                StackEntry entry = stack.Pop();
                ucskip = entry.NumberOfCharactersToSkip;
                ignorable = entry.Ignorable;
            }
        }
        else if (character.Length > 0) // \x (not a letter)
        {
            curskip = 0;
            if (character == "*")
            {
                // "\*" marks the enclosing group as an ignorable destination.
                ignorable = true;
            }
            else if (!ignorable)
            {
                if (character == "~")
                {
                    output.Append('\u00A0'); // non-breaking space
                }
                else if (character == "{" || character == "}" || character == "\\")
                {
                    output.Append(character); // escaped literal
                }
            }
        }
        else if (word.Length > 0) // \foo
        {
            curskip = 0;
            string replacement;
            int number;
            if (destinations.Contains(word))
            {
                ignorable = true;
            }
            else if (ignorable)
            {
                // Control words inside an ignorable group have no effect.
            }
            else if (specialCharacters.TryGetValue(word, out replacement))
            {
                output.Append(replacement);
            }
            else if (word == "uc")
            {
                if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out number))
                {
                    ucskip = Math.Max(0, number);
                }
            }
            else if (word == "u")
            {
                if (Int32.TryParse(arg, System.Globalization.NumberStyles.AllowLeadingSign, System.Globalization.CultureInfo.InvariantCulture, out number))
                {
                    int c = number;
                    if (c < 0)
                    {
                        c += 0x10000;
                    }

                    if (c >= 0 && c <= 0xFFFF)
                    {
                        // \uN carries a UTF-16 code unit. Appending it as-is lets two
                        // consecutive \u words that form a surrogate pair combine into one
                        // character instead of being replaced by "??". An unpaired
                        // surrogate is passed through unchanged.
                        output.Append((char)c);
                    }
                    else if (c > 0xFFFF && c <= 0x10FFFF)
                    {
                        // A valid UTF-32 value above the BMP.
                        output.Append(Char.ConvertFromUtf32(c));
                    }
                    else
                    {
                        output.Append("?");
                    }

                    curskip = ucskip;
                }
            }
        }
        else if (hex.Length > 0) // \'xx
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                // The byte is mapped straight to the code point of the same value.
                // NOTE(review): this ignores the document code page (\ansicpgN), so
                // bytes 0x80-0x9F of e.g. Windows-1252 text are not translated.
                int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture);
                output.Append((char)c);
            }
        }
        else if (tchar.Length > 0)
        {
            if (curskip > 0)
            {
                curskip -= 1;
            }
            else if (!ignorable)
            {
                output.Append(tchar);
            }
        }
    }

    return output.ToString();
}

关于c# - 如何在 C# 中将 rtf 字符串转换为文本,我们在Stack Overflow上找到一个类似的问题:

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号