gpt4 book ai didi

sql - 如何在 LEFT JOIN 的 ON 子句中使用字符串函数与 BigQuery 中的标准 SQL?

转载 作者:行者123 更新时间:2023-12-04 02:55:40 26 4
gpt4 key购买 nike

LEFT JOIN ON 中使用字符串函数(例如 STARTS_WITH)或运算符(例如 LIKE)时我遇到了问题> 其中任一个的参数来自连接中的两个表。我没有尝试解释摘要,而是提炼了一个小例子......

让我们考虑一个名为 fuzzylog 的表,它有一个关键字段 fullname,我想通过加入一个 names 表来规范化同一列。 fuzzylog 中的关键字段可能有点困惑或格式不固定,因此无法直接进行相等连接。这些表可能看起来像这样:

模糊日志表:

fuzzylog table

names 表:

names table

名称表试图通过提供一个 contains 字段来适应模糊性,我想在完全匹配失败时使用 LIKE 加入该字段:

#standardSQL
SELECT l.id, n.fullname, n.nameid,
l.fullname AS logged_fullname
FROM `neilotemp.fuzzylog` l
LEFT JOIN `neilotemp.names` n
ON l.fullname = n.fullname
OR l.fullname LIKE CONCAT('%', n.contains, '%')

不幸的是,我真正需要的最后一行导致了错误:LEFT OUTER JOIN cannot be used without a condition that is equality of fields from both sides of join. 这确实是我要解决的问题。

我已经进行了相当多的搜索,但一直没能找到任何让我印象深刻的东西。我必须从正确的道路上走下概念上的一步。

最佳答案

老实说,我认为在这里使用 contains 字段并不是最好的主意
相反,请考虑使用 Levenshtein [编辑] 距离的以下方法

#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};

var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;

// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;

// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}

// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;

for (j=0; j<str2.length; ++j) {
curCol = nextCol;

// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}

// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}

// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}

return nextCol;
}

};

var the_string1;

try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}

try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}

return Levenshtein.get(the_string1, the_string2)

""";
WITH notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2

您可以使用问题中的虚拟数据进行测试,如下所示

#standardSQL
CREATE TEMPORARY FUNCTION EDIT_DISTANCE(string1 STRING, string2 STRING)
RETURNS INT64
LANGUAGE js AS """
var _extend = function(dst) {
var sources = Array.prototype.slice.call(arguments, 1);
for (var i=0; i<sources.length; ++i) {
var src = sources[i];
for (var p in src) {
if (src.hasOwnProperty(p)) dst[p] = src[p];
}
}
return dst;
};

var Levenshtein = {
/**
* Calculate levenshtein distance of the two strings.
*
* @param str1 String the first string.
* @param str2 String the second string.
* @return Integer the levenshtein distance (0 and above).
*/
get: function(str1, str2) {
// base cases
if (str1 === str2) return 0;
if (str1.length === 0) return str2.length;
if (str2.length === 0) return str1.length;

// two rows
var prevRow = new Array(str2.length + 1),
curCol, nextCol, i, j, tmp;

// initialise previous row
for (i=0; i<prevRow.length; ++i) {
prevRow[i] = i;
}

// calculate current row distance from previous row
for (i=0; i<str1.length; ++i) {
nextCol = i + 1;

for (j=0; j<str2.length; ++j) {
curCol = nextCol;

// substution
nextCol = prevRow[j] + ( (str1.charAt(i) === str2.charAt(j)) ? 0 : 1 );
// insertion
tmp = curCol + 1;
if (nextCol > tmp) {
nextCol = tmp;
}
// deletion
tmp = prevRow[j + 1] + 1;
if (nextCol > tmp) {
nextCol = tmp;
}

// copy current col value into previous (in preparation for next iteration)
prevRow[j] = curCol;
}

// copy last col value into previous (in preparation for next iteration)
prevRow[j] = nextCol;
}

return nextCol;
}

};

var the_string1;

try {
the_string1 = decodeURI(string1).toLowerCase();
} catch (ex) {
the_string1 = string1.toLowerCase();
}

try {
the_string2 = decodeURI(string2).toLowerCase();
} catch (ex) {
the_string2 = string2.toLowerCase();
}

return Levenshtein.get(the_string1, the_string2)

""";
WITH `project.dataset.fuzzylog` AS (
SELECT 1 id, 'John Smith' fullname UNION ALL
SELECT 2, 'Jane Doe' UNION ALL
SELECT 3, 'Ms. Jane Doe' UNION ALL
SELECT 4, 'Mr. John Smith' UNION ALL
SELECT 5, 'Smith, John' UNION ALL
SELECT 6, 'J.Smith' UNION ALL
SELECT 7, 'J. Doe'
), `project.dataset.names` AS (
SELECT 1 nameid, 'John Smith' fullname, 'smith' match UNION ALL
SELECT 2, 'Jane Doe', 'doe'
), notrmalized_fuzzylog as (
select id, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.fuzzylog`
), normalized_names as (
select nameid, fullname,
(select string_agg(part, ' ' order by part) from unnest(split(fullname, ' ')) part) ordered_fullname
from `project.dataset.names`
)
select
id, l.fullname AS logged_fullname,
ARRAY_AGG(
STRUCT(n.nameid, n.fullname)
ORDER BY EDIT_DISTANCE(l.ordered_fullname, n.ordered_fullname) LIMIT 1
)[OFFSET(0)].*
FROM notrmalized_fuzzylog l
CROSS JOIN normalized_names n
GROUP BY 1, 2
-- ORDER BY 1

结果:

Row id  logged_fullname nameid  fullname     
1 1 John Smith 1 John Smith
2 2 Jane Doe 2 Jane Doe
3 3 Ms. Jane Doe 2 Jane Doe
4 4 Mr. John Smith 1 John Smith
5 5 Smith, John 1 John Smith
6 6 J.Smith 1 John Smith
7 7 J. Doe 2 Jane Doe

正如您在该解决方案中看到的那样,我们完全忽略/删除了任何额外的人工列(如 contains)的使用,而是应用 Levenshtein 距离直接测量两个全名之间的相似性。正如您在执行此操作之前看到的那样,我们对全名进行重新排序/规范化以对其部分进行排序
如果这种方法对您有用 - 您应该考虑通过首先删除/替换所有标点符号(如点、逗号等)和空格来改进重新排序以获得更好的结果

关于sql - 如何在 LEFT JOIN 的 ON 子句中使用字符串函数与 BigQuery 中的标准 SQL?,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/53198374/

26 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com