gpt4 book ai didi

c# - 除了计算距离之外,如何衡量 2 个字符串的相似性

转载 作者:塔克拉玛干 更新时间:2023-11-03 03:09:41 27 4
gpt4 key购买 nike

我正在创建一个程序,用来检查一个单词是否为缩写词(如 txt、msg 等);如果是缩写词,就找出它的正确拼写,例如 txt=text、msg=message。我在 C# 中使用 NHunspell 的 Suggest 方法,它会给出所有可能的候选结果。

问题是,如果我输入“txt”,结果是text、tat、tot 等。我不知道如何选择正确的词。我使用了 Levenshtein 距离 ( C# - Compare String Similarity ),但结果仍然为 1。

输入:txt;结果:text = 1, ext = 1, tit = 1

您能帮我找出缩写词的含义或正确拼写吗?示例:msg

最佳答案

我已经用您的示例数据测试了您的输入,只有 text 的距离为 25,而其他词的距离都是 33。这是我的代码:

// Rank candidate spellings by their normalized Levenshtein distance to the
// input and print every candidate that shares the smallest distance.
string input = "TXT";
string[] words = new[] { "text", "tat", "tot" };
var levenshtein = new Levenshtein();

// iLD reports a 0-100 percentage (0 == identical); discard anything above 30.
const int maxDistance = 30;

var distanceGroups = words
    .Select(w => new
    {
        Word = w,
        // Candidates are upper-cased so the comparison matches the upper-case input.
        Distance = levenshtein.iLD(w.ToUpperInvariant(), input)
    })
    .Where(x => x.Distance <= maxDistance)
    .GroupBy(x => x.Distance)
    .OrderBy(g => g.Key)
    .ToList();

// First() would throw InvalidOperationException when every candidate is
// filtered out; FirstOrDefault() yields null instead and nothing is printed.
var closestGroup = distanceGroups.FirstOrDefault();
if (closestGroup != null)
{
    foreach (var topCandidate in closestGroup)
    {
        Console.WriteLine("Word:{0} Distance:{1}", topCandidate.Word, topCandidate.Distance);
    }
}

这是 levenshtein 类:

/// <summary>
/// Computes the Levenshtein (edit) distance between two strings, normalized
/// to a percentage of the longer string's length.
/// </summary>
/// <remarks>
/// Both public methods return a value between 0 and 100, where 0 means the
/// strings are identical and 100 means they are totally different. The
/// comparison is an exact, case-sensitive <see cref="char"/> comparison.
/// </remarks>
public class Levenshtein
{
    /// <summary>
    /// Computes the normalized Levenshtein distance using two rolling
    /// vectors instead of a full matrix (memory-efficient version).
    /// </summary>
    /// <param name="sRow">First string to compare.</param>
    /// <param name="sCol">Second string to compare.</param>
    /// <returns>
    /// A value between 0 and 100: 0 for a perfect match, 100 for totally
    /// different strings. Two empty strings yield 0.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="sRow"/> or <paramref name="sCol"/> is null.
    /// </exception>
    public int iLD(String sRow, String sCol)
    {
        if (sRow == null)
        {
            throw new ArgumentNullException("sRow");
        }

        if (sCol == null)
        {
            throw new ArgumentNullException("sCol");
        }

        int rowLen = sRow.Length;
        int colLen = sCol.Length;

        // When one side is empty the edit distance is the other side's length.
        // It is normalized like every other result so callers always get the
        // 0-100 scale rather than a raw character count.
        if (rowLen == 0 || colLen == 0)
        {
            return Normalize(Math.Max(rowLen, colLen), rowLen, colLen);
        }

        // previous holds the distances for the prior column, current for the
        // column being filled in.
        int[] previous = new int[rowLen + 1];
        int[] current = new int[rowLen + 1];

        // Distance from the empty prefix of sCol to each prefix of sRow.
        for (int rowIdx = 0; rowIdx <= rowLen; rowIdx++)
        {
            previous[rowIdx] = rowIdx;
        }

        for (int colIdx = 1; colIdx <= colLen; colIdx++)
        {
            // Distance from the empty prefix of sRow to this prefix of sCol.
            current[0] = colIdx;
            char colChar = sCol[colIdx - 1];

            for (int rowIdx = 1; rowIdx <= rowLen; rowIdx++)
            {
                int cost = sRow[rowIdx - 1] == colChar ? 0 : 1;

                current[rowIdx] = Minimum(
                    previous[rowIdx] + 1,          // skip a character of sCol
                    current[rowIdx - 1] + 1,       // skip a character of sRow
                    previous[rowIdx - 1] + cost);  // match or substitute
            }

            // Swap the vectors so the column just computed becomes "previous".
            int[] swap = previous;
            previous = current;
            current = swap;
        }

        // The vectors were swapped at the end of the last iteration, so the
        // final column lives in previous rather than current.
        return Normalize(previous[rowLen], rowLen, colLen);
    }

    /// <summary>
    /// Returns the smallest of three integers.
    /// </summary>
    private static int Minimum(int a, int b, int c)
    {
        return Math.Min(a, Math.Min(b, c));
    }

    /// <summary>
    /// Scales a raw edit distance to a 0-100 percentage of the longer length.
    /// </summary>
    private static int Normalize(int distance, int firstLen, int secondLen)
    {
        int longest = Math.Max(firstLen, secondLen);
        if (longest == 0)
        {
            // Two empty strings are identical.
            return 0;
        }

        // Multiply as long: 100 * distance overflows int for very long inputs.
        return (int)((100L * distance) / longest);
    }

    /// <summary>
    /// Computes the normalized Levenshtein distance using a full
    /// (sNew.Length + 1) x (sOld.Length + 1) matrix.
    /// </summary>
    /// <param name="sNew">First string to compare.</param>
    /// <param name="sOld">Second string to compare.</param>
    /// <returns>
    /// A value between 0 and 100: 0 for a perfect match, 100 for totally
    /// different strings. Two empty strings yield 0.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="sNew"/> or <paramref name="sOld"/> is null.
    /// </exception>
    public int LD(String sNew, String sOld)
    {
        if (sNew == null)
        {
            throw new ArgumentNullException("sNew");
        }

        if (sOld == null)
        {
            throw new ArgumentNullException("sOld");
        }

        int newLen = sNew.Length;
        int oldLen = sOld.Length;

        // Same normalization for the empty-string shortcut as in iLD.
        if (newLen == 0 || oldLen == 0)
        {
            return Normalize(Math.Max(newLen, oldLen), newLen, oldLen);
        }

        int[,] matrix = new int[newLen + 1, oldLen + 1];

        // First column / first row: distance to the empty prefix.
        for (int newIdx = 0; newIdx <= newLen; newIdx++)
        {
            matrix[newIdx, 0] = newIdx;
        }

        for (int oldIdx = 0; oldIdx <= oldLen; oldIdx++)
        {
            matrix[0, oldIdx] = oldIdx;
        }

        for (int newIdx = 1; newIdx <= newLen; newIdx++)
        {
            char newChar = sNew[newIdx - 1];

            for (int oldIdx = 1; oldIdx <= oldLen; oldIdx++)
            {
                int cost = newChar == sOld[oldIdx - 1] ? 0 : 1;

                matrix[newIdx, oldIdx] = Minimum(
                    matrix[newIdx - 1, oldIdx] + 1,
                    matrix[newIdx, oldIdx - 1] + 1,
                    matrix[newIdx - 1, oldIdx - 1] + cost);
            }
        }

        return Normalize(matrix[newLen, oldLen], newLen, oldLen);
    }
}

关于c# - 除了计算距离之外,如何衡量 2 个字符串的相似性,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/17748969/

27 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com