c# - 如何在 C# 中将 rtf 字符串转换为文本-6ren

c# - 如何在 C# 中将 rtf 字符串转换为文本

转载作者：可可西里更新时间：2023-11-01 09:13:14

有没有一种简单的方法可以在不使用 RichTextBox 的情况下从 Rtf 字符串中提取文本？？

例子:

{\rtf1\ansi\ansicpg1252\uc1\htmautsp\deff2{\fonttbl{\f0\fcharset0 Times New Roman;}{\f2\fcharset0 Segoe UI;}}{\colortbl\red0\green0\blue0;\red255\green255\blue255;}\loch\hich\dbch\pard\plain\ltrpar\itap0{\lang1033\fs18\f2\cf0 \cf0\ql{\f2 {\lang2070\ltrch foo}\li0\ri0\sa0\sb0\fi0\ql\par} 
{\f2 {\lang2070\ltrch bar }\li0\ri0\sa0\sb0\fi0\ql\par}
}
}

应该返回:

foo 
bar

最佳答案

如何在不引用其他库的情况下用纯 C# 实现:

这个人写了一个类，按照 OP 的要求将 RTF 剥离为纯文本。这是 source

这是他的代码:

    /// <summary>
    /// Rich Text Stripper
    /// </summary>
    /// <remarks>
    /// Translated from Python located at:
    /// http://stackoverflow.com/a/188877/448
    /// </remarks>
    public static class RichTextStripper
    {
        private class StackEntry
        {
            public int NumberOfCharactersToSkip { get; set; }
            public bool Ignorable { get; set; }

            public StackEntry(int numberOfCharactersToSkip, bool ignorable)
            {
                NumberOfCharactersToSkip = numberOfCharactersToSkip;
                Ignorable = ignorable;
            }
        }

        private static readonly Regex _rtfRegex = new Regex(@"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", RegexOptions.Singleline | RegexOptions.IgnoreCase);

        private static readonly List<string> destinations = new List<string>
    {
        "aftncn","aftnsep","aftnsepc","annotation","atnauthor","atndate","atnicn","atnid",
        "atnparent","atnref","atntime","atrfend","atrfstart","author","background",
        "bkmkend","bkmkstart","blipuid","buptim","category","colorschememapping",
        "colortbl","comment","company","creatim","datafield","datastore","defchp","defpap",
        "do","doccomm","docvar","dptxbxtext","ebcend","ebcstart","factoidname","falt",
        "fchars","ffdeftext","ffentrymcr","ffexitmcr","ffformat","ffhelptext","ffl",
        "ffname","ffstattext","field","file","filetbl","fldinst","fldrslt","fldtype",
        "fname","fontemb","fontfile","fonttbl","footer","footerf","footerl","footerr",
        "footnote","formfield","ftncn","ftnsep","ftnsepc","g","generator","gridtbl",
        "header","headerf","headerl","headerr","hl","hlfr","hlinkbase","hlloc","hlsrc",
        "hsv","htmltag","info","keycode","keywords","latentstyles","lchars","levelnumbers",
        "leveltext","lfolevel","linkval","list","listlevel","listname","listoverride",
        "listoverridetable","listpicture","liststylename","listtable","listtext",
        "lsdlockedexcept","macc","maccPr","mailmerge","maln","malnScr","manager","margPr",
        "mbar","mbarPr","mbaseJc","mbegChr","mborderBox","mborderBoxPr","mbox","mboxPr",
        "mchr","mcount","mctrlPr","md","mdeg","mdegHide","mden","mdiff","mdPr","me",
        "mendChr","meqArr","meqArrPr","mf","mfName","mfPr","mfunc","mfuncPr","mgroupChr",
        "mgroupChrPr","mgrow","mhideBot","mhideLeft","mhideRight","mhideTop","mhtmltag",
        "mlim","mlimloc","mlimlow","mlimlowPr","mlimupp","mlimuppPr","mm","mmaddfieldname",
        "mmath","mmathPict","mmathPr","mmaxdist","mmc","mmcJc","mmconnectstr",
        "mmconnectstrdata","mmcPr","mmcs","mmdatasource","mmheadersource","mmmailsubject",
        "mmodso","mmodsofilter","mmodsofldmpdata","mmodsomappedname","mmodsoname",
        "mmodsorecipdata","mmodsosort","mmodsosrc","mmodsotable","mmodsoudl",
        "mmodsoudldata","mmodsouniquetag","mmPr","mmquery","mmr","mnary","mnaryPr",
        "mnoBreak","mnum","mobjDist","moMath","moMathPara","moMathParaPr","mopEmu",
        "mphant","mphantPr","mplcHide","mpos","mr","mrad","mradPr","mrPr","msepChr",
        "mshow","mshp","msPre","msPrePr","msSub","msSubPr","msSubSup","msSubSupPr","msSup",
        "msSupPr","mstrikeBLTR","mstrikeH","mstrikeTLBR","mstrikeV","msub","msubHide",
        "msup","msupHide","mtransp","mtype","mvertJc","mvfmf","mvfml","mvtof","mvtol",
        "mzeroAsc","mzeroDesc","mzeroWid","nesttableprops","nextfile","nonesttables",
        "objalias","objclass","objdata","object","objname","objsect","objtime","oldcprops",
        "oldpprops","oldsprops","oldtprops","oleclsid","operator","panose","password",
        "passwordhash","pgp","pgptbl","picprop","pict","pn","pnseclvl","pntext","pntxta",
        "pntxtb","printim","private","propname","protend","protstart","protusertbl","pxe",
        "result","revtbl","revtim","rsidtbl","rxe","shp","shpgrp","shpinst",
        "shppict","shprslt","shptxt","sn","sp","staticval","stylesheet","subject","sv",
        "svb","tc","template","themedata","title","txe","ud","upr","userprops",
        "wgrffmtfilter","windowcaption","writereservation","writereservhash","xe","xform",
        "xmlattrname","xmlattrvalue","xmlclose","xmlname","xmlnstbl",
        "xmlopen"
    };

        private static readonly Dictionary<string, string> specialCharacters = new Dictionary<string, string>
    {
        { "par", "\n" },
        { "sect", "\n\n" },
        { "page", "\n\n" },
        { "line", "\n" },
        { "tab", "\t" },
        { "emdash", "\u2014" },
        { "endash", "\u2013" },
        { "emspace", "\u2003" },
        { "enspace", "\u2002" },
        { "qmspace", "\u2005" },
        { "bullet", "\u2022" },
        { "lquote", "\u2018" },
        { "rquote", "\u2019" },
        { "ldblquote", "\u201C" },
        { "rdblquote", "\u201D" },
    };
        /// <summary>
        /// Strip RTF Tags from RTF Text
        /// </summary>
        /// <param name="inputRtf">RTF formatted text</param>
        /// <returns>Plain text from RTF</returns>
        public static string StripRichTextFormat(string inputRtf)
        {
            if (inputRtf == null)
            {
                return null;
            }

            string returnString;

            var stack = new Stack<StackEntry>();
            bool ignorable = false;              // Whether this group (and all inside it) are "ignorable".
            int ucskip = 1;                      // Number of ASCII characters to skip after a unicode character.
            int curskip = 0;                     // Number of ASCII characters left to skip
            var outList = new List<string>();    // Output buffer.

            MatchCollection matches = _rtfRegex.Matches(inputRtf);

            if (matches.Count > 0)
            {
                foreach (Match match in matches)
                {
                    string word = match.Groups[1].Value;
                    string arg = match.Groups[2].Value;
                    string hex = match.Groups[3].Value;
                    string character = match.Groups[4].Value;
                    string brace = match.Groups[5].Value;
                    string tchar = match.Groups[6].Value;

                    if (!String.IsNullOrEmpty(brace))
                    {
                        curskip = 0;
                        if (brace == "{")
                        {
                            // Push state
                            stack.Push(new StackEntry(ucskip, ignorable));
                        }
                        else if (brace == "}")
                        {
                            // Pop state
                            StackEntry entry = stack.Pop();
                            ucskip = entry.NumberOfCharactersToSkip;
                            ignorable = entry.Ignorable;
                        }
                    }
                    else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
                    {
                        curskip = 0;
                        if (character == "~")
                        {
                            if (!ignorable)
                            {
                                outList.Add("\xA0");
                            }
                        }
                        else if ("{}\\".Contains(character))
                        {
                            if (!ignorable)
                            {
                                outList.Add(character);
                            }
                        }
                        else if (character == "*")
                        {
                            ignorable = true;
                        }
                    }
                    else if (!String.IsNullOrEmpty(word)) // \foo
                    {
                        curskip = 0;
                        if (destinations.Contains(word))
                        {
                            ignorable = true;
                        }
                        else if (ignorable)
                        {
                        }
                        else if (specialCharacters.ContainsKey(word))
                        {
                            outList.Add(specialCharacters[word]);
                        }
                        else if (word == "uc")
                        {
                            ucskip = Int32.Parse(arg);
                        }
                        else if (word == "u")
                        {
                            int c = Int32.Parse(arg);
                            if (c < 0)
                            {
                                c += 0x10000;
                            }
                            outList.Add(Char.ConvertFromUtf32(c));
                            curskip = ucskip;
                        }
                    }
                    else if (!String.IsNullOrEmpty(hex)) // \'xx
                    {
                        if (curskip > 0)
                        {
                            curskip -= 1;
                        }
                        else if (!ignorable)
                        {
                            int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
                            outList.Add(Char.ConvertFromUtf32(c));
                        }
                    }
                    else if (!String.IsNullOrEmpty(tchar))
                    {
                        if (curskip > 0)
                        {
                            curskip -= 1;
                        }
                        else if (!ignorable)
                        {
                            outList.Add(tchar);
                        }
                    }
                }
            }
            else
            {
                // Didn't match the regex
                returnString = inputRtf;
            }

            returnString = String.Join(String.Empty, outList.ToArray());

            return returnString;
        }
    }

编辑 1:与此同时，我们让这段代码在生产中运行以进行测试和改编版本。新版本进行了一些额外的安全检查并更好地处理新行。

public static string StripRichTextFormat(string inputRtf)
    {
        if (inputRtf == null)
        {
            return null;
        }

        string returnString;

        var stack = new Stack<StackEntry>();
        bool ignorable = false;              // Whether this group (and all inside it) are "ignorable".
        int ucskip = 1;                      // Number of ASCII characters to skip after a unicode character.
        int curskip = 0;                     // Number of ASCII characters left to skip
        var outList = new List<string>();    // Output buffer.

        MatchCollection matches = _rtfRegex.Matches(inputRtf);

        if (matches.Count > 0)
        {
            foreach (Match match in matches)
            {
                string word = match.Groups[1].Value;
                string arg = match.Groups[2].Value;
                string hex = match.Groups[3].Value;
                string character = match.Groups[4].Value;
                string brace = match.Groups[5].Value;
                string tchar = match.Groups[6].Value;

                if (!String.IsNullOrEmpty(brace))
                {
                    curskip = 0;
                    if (brace == "{")
                    {
                        // Push state
                        stack.Push(new StackEntry(ucskip, ignorable));
                    }
                    else if (brace == "}")
                    {
                        // Pop state
                        StackEntry entry = stack.Pop();
                        ucskip = entry.NumberOfCharactersToSkip;
                        ignorable = entry.Ignorable;
                    }
                }
                else if (!String.IsNullOrEmpty(character)) // \x (not a letter)
                {
                    curskip = 0;
                    if (character == "~")
                    {
                        if (!ignorable)
                        {
                            outList.Add("\xA0");
                        }
                    }
                    else if ("{}\\".Contains(character))
                    {
                        if (!ignorable)
                        {
                            outList.Add(character);
                        }
                    }
                    else if (character == "*")
                    {
                        ignorable = true;
                    }
                }
                else if (!String.IsNullOrEmpty(word)) // \foo
                {
                    curskip = 0;
                    if (destinations.Contains(word))
                    {
                        ignorable = true;
                    }
                    else if (ignorable)
                    {
                    }
                    else if (specialCharacters.ContainsKey(word))
                    {
                        outList.Add(specialCharacters[word]);
                    }
                    else if (word == "uc")
                    {
                        ucskip = Int32.Parse(arg);
                    }
                    else if (word == "u")
                    {
                        int c = Int32.Parse(arg);
                        if (c < 0)
                        {
                            c += 0x10000;
                        }
                        //Ein gültiger UTF32-Wert ist zwischen 0x000000 und 0x10ffff (einschließlich) und sollte keine Ersatzcodepunktwerte (0x00d800 ~ 0x00dfff)
                        if (c >= 0x000000 && c <= 0x10ffff && (c < 0x00d800 || c > 0x00dfff))
                            outList.Add(Char.ConvertFromUtf32(c));
                        else outList.Add("?");
                        curskip = ucskip;
                    }
                }
                else if (!String.IsNullOrEmpty(hex)) // \'xx
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        int c = Int32.Parse(hex, System.Globalization.NumberStyles.HexNumber);
                        outList.Add(Char.ConvertFromUtf32(c));
                    }
                }
                else if (!String.IsNullOrEmpty(tchar))
                {
                    if (curskip > 0)
                    {
                        curskip -= 1;
                    }
                    else if (!ignorable)
                    {
                        outList.Add(tchar);
                    }
                }
            }
        }
        else
        {
            // Didn't match the regex
            returnString = inputRtf;
        }

        returnString = String.Join(String.Empty, outList.ToArray());

        return returnString;
    }

关于c# - 如何在 C# 中将 rtf 字符串转换为文本，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/5634525/

文章推荐： node.js - Node.js async eachLimit 在这种情况下如何工作？

文章推荐： Android HTTPPost 返回错误 "Method not allowed."

文章推荐： android - 使用配对的 Google 帐户作为您的 Android 应用登录

文章推荐： mongodb - 为什么MongoDB的本地数据库几乎全在内存中？

c++ - 将迭代器存储到字符串中(转换、转换、追加？)
我正在尝试将一个字符串逐个字符地复制到另一个字符串中。目的不是复制整个字符串，而是复制其中的一部分(我稍后会为此做一些条件......) 但我不知道如何使用迭代器。你能帮帮我吗？ std::stri
C++:转换/转换 void 指针到结构引用
我想将 void 指针转换为结构引用。结构的最小示例: #include "Interface.h" class Foo { public: Foo() : mAddress((uint
javascript - 为什么一个元素从窗口的左上角开始它的 css3 转换/转换？
这有点烦人:我有一个 div，它从窗口的左上角开始过渡，即使它位于文档的其他任何位置。我试过 usign -webkit-transform-origin 但没有成功，也许我用错了。有人可以帮助我吗？
html - 有什么方法可以检测 CSS3 转换/转换/动画的过程状态？
假设，如果将 CSS3 转换/转换/动画分配给 DOM 元素，我是否可以检测到该过程的状态？我想这样做的原因是因为我正在寻找类似过渡链的东西，例如，在前一个过渡之后运行一个过渡。最佳答案我在 h
CSS 转换/转换 - 谷歌浏览器中的 "shaky"图像
最近我遇到了“不稳定”屏幕，这很可能是由 CSS 转换引起的。事实上，它只发生在 Chrome 浏览器上(可能还有 Safari，因为一些人也报告了它)。知道如何让它看起来光滑吗？此外，您可能会注意
jquery - CSS3 转换(转换)在 Firefox 中不起作用，但在 Chrome 和 Safari 中起作用
我正在开发一个简单的 slider ，它使用 CSS 过渡来为幻灯片设置动画。我用一些基本样式和一些 javascript 创建了一支笔 here .注意:由于 Codepen 使用 Prefixfr
Linq 转换
我正在使用以下代码返回 IList: public IList FindCodesByCountry(string country) { var query =
转换、计算的RESTful设计
如何设计像这样的操作: 计算转化翻译例如:从“EUR”转换为“CNY”金额“100”。这是 /convert?from=EUR&to=CNY&amount=100 RESTful 吗？最佳答
Jquery 转换
我使用 jquery 组合了一个图像滚动器，如下所示 function rotateImages(whichHolder, start) { var images = $('#' +which
CSS 转换
如何使用 CSS (-moz-transform) 更改一个如下所示的 div: 最佳答案你可以看看Mozilla Developer Center .甚至还有例子。但是，在我看来，您的具体示例不
CSS 转换
我需要帮助我正在尝试在选中和未选中的汉堡菜单上实现动画。我能够为菜单设置动画，但我不知道如何在转换为 0 时为左菜单动画设置动画 &__menu { transform: translateX(
swift :转换
我正在为字典格式之间的转换而苦苦挣扎:我正在尝试将下面的项目数组转换为下面的结果数组。本质上是通过在项目第一个元素中查找重复项，然后仅在第一个参数不同时才将文件添加到结果集中。 var items:[
具有相同布局的不同类型之间的C++转换
如果我有两个定义相同的结构，那么在它们之间进行转换的最佳方式是什么？ struct A { int i; float f; }; struct B { int i; float f; }; void
Javascript 转换
我编写了一个 javascript 代码，可以将视口(viewport)从一个链接滑动到另一个链接。基本上一切正常，你怎么能在那里看到http://jsfiddle.net/DruwJ/8/ 我现在的
meteorjs 图像上传/转换
我需要将文件上传到 meteor ，对其进行一些图像处理(必要时进行图像转换，从图像生成缩略图)，然后将其存储在外部图像存储服务器(s3)中。这应该尽可能快。您对 nodejs 图像处理库有什么建议
KDB+，转换，左操作数
刚开始接触KDB+，有一些问题很难从Q for Mortals中得到。说，这里 http://code.kx.com/wiki/JB:QforMortals2/casting_and_enumera
JSF float 转换
我在这里的一个项目中使用 JSF 1.2 和 IceFaces 1.8。我有一个页面，它基本上是一大堆浮点数字段的大编辑网格。这是通过 inputText 实现的页面上的字段指向具有原始值的值对象
SceneKit 转换 ScnMatrix4
ScnMatrix4 是一个 4x4 矩阵。我的问题是什么矩阵行对应于位置(ScnVector3)，旋转(ScnVector4)，比例(ScnVector3)。第 4 行是空的吗？编辑: 我玩弄了
Scala map 转换
恐怕我是 Scala 新手: 我正在尝试根据一些简单的逻辑将 Map 转换为新 Map: val postVals = Map("test" -> "testing1", "test2" -> "te
基于配置文件的 XSLT 转换
输入: This is sample 1 This is sample 2 输出: ~COLOR~[Green]This is sample 1~COLOR~[Red]This is sam

可可西里

个人简介

我是一名优秀的程序员,十分优秀！

作者热门文章

滴滴打车优惠券免费领取

全站热门文章

首页

博学

6Ren·AI

商城

c# - 如何在 C# 中将 rtf 字符串转换为文本