gpt4 book ai didi

sql - 将 [每行一个单词] 与 [每行多个单词] 的短语行连接起来

转载 作者:行者123 更新时间:2023-12-03 00:27:00 25 4
gpt4 key购买 nike

请原谅问题的长度。我提供了一个测试脚本来演示情况以及我对解决方案的最佳尝试。

有两个表:

  1. test_WORDS = 从多个来源按顺序提取的单词。 OBJ_FK 列是源的 ID。 WORD_ID 是单词本身的标识符,在源中是唯一的。每行包含一个单词。
  2. test_PHRASE = 要在 test_WORDS 中搜索的短语列表。 PHRASE_TEXT 列是一个空格分隔的短语,例如“foo bar”(见下文),以便每行包含多个单词。

要求:返回 test_WORDS 中的第一个单词,该单词是 test_PHRASE 中匹配短语的开头。

我更喜欢基于一些设置来避免下面的 RBAR 方法。而且我的解决方案仅限于 5 个单词短语。我需要支持最多 20 个单词的短语。是否可以在没有光标的情况下将 test_PHRASE 中的一行中的单词与 test_WORD 中的连续行进行匹配?

将短语单词分解到临时表中后,问题归结为按行顺序将两个集合的部分匹配在一起。

-- Create test data
CREATE TABLE [dbo].[test_WORDS](
[OBJ_FK] [bigint] NOT NULL, --FK to the source object
[WORD_ID] [int] NOT NULL, --The word order in the source object
[WORD_TEXT] [nvarchar](50) NOT NULL,
CONSTRAINT [PK_test_WORDS] PRIMARY KEY CLUSTERED
(
[OBJ_FK] ASC,
[WORD_ID] ASC
)
) ON [PRIMARY]
GO

CREATE TABLE [dbo].[test_PHRASE](
[ID] [int], --PHRASE ID
[PHRASE_TEXT] [nvarchar](150) NOT NULL --Space-separated phrase
CONSTRAINT [PK_test_PHRASE] PRIMARY KEY CLUSTERED
(
[ID] ASC
)
)
GO
INSERT INTO dbo.test_WORDS
SELECT 1,1,'aaa' UNION ALL
SELECT 1,2,'bbb' UNION ALL
SELECT 1,3,'ccc' UNION ALL
SELECT 1,4,'ddd' UNION ALL
SELECT 1,5,'eee' UNION ALL
SELECT 1,6,'fff' UNION ALL
SELECT 1,7,'ggg' UNION ALL
SELECT 1,8,'hhh' UNION ALL
SELECT 2,1,'zzz' UNION ALL
SELECT 2,2,'yyy' UNION ALL
SELECT 2,3,'xxx' UNION ALL
SELECT 2,4,'www'

INSERT INTO dbo.test_PHRASE
SELECT 1, 'bbb ccc ddd' UNION ALL --should match
SELECT 2, 'ddd eee fff' UNION ALL --should match
SELECT 3, 'xxx xxx xxx' UNION ALL --should NOT match
SELECT 4, 'zzz yyy xxx' UNION ALL --should match
SELECT 5, 'xxx www ppp' UNION ALL --should NOT match
SELECT 6, 'zzz yyy xxx www' --should match

-- Create variables
DECLARE @maxRow AS INTEGER
DECLARE @currentRow AS INTEGER
DECLARE @phraseSubsetTable AS TABLE(
[ROW] int IDENTITY(1,1) NOT NULL,
[ID] int NOT NULL, --PHRASE ID
[PHRASE_TEXT] nvarchar(150) NOT NULL
)
--used to split the phrase into words
--note: No permissions to sys.dm_fts_parser
DECLARE @WordList table
(
ID int,
WORD nvarchar(50)
)
--Records to be returned to caller
DECLARE @returnTable AS TABLE(
OBJECT_FK INT NOT NULL,
WORD_ID INT NOT NULL,
PHRASE_ID INT NOT NULL
)
DECLARE @phrase AS NVARCHAR(150)
DECLARE @phraseID AS INTEGER

-- Get subset of phrases to simulate a join that would occur in production
INSERT INTO @phraseSubsetTable
SELECT ID, PHRASE_TEXT
FROM dbo.test_PHRASE
--represent subset of phrases caused by join in production
WHERE ID IN (2,3,4)

-- Loop each phrase in the subset, split into rows of words and return matches to the test_WORDS table
SET @maxRow = @@ROWCOUNT
SET @currentRow = 1
WHILE @currentRow <= @maxRow
BEGIN
SELECT @phrase=PHRASE_TEXT, @phraseID=ID FROM @phraseSubsetTable WHERE row = @currentRow

--clear previous phrase that was split into rows
DELETE FROM @WordList

--Recursive Function with CTE to create recordset of words, one per row
;WITH Pieces(pn, start, stop) AS (
SELECT 1, 1, CHARINDEX(' ', @phrase)
UNION ALL
SELECT pn + 1, stop + 1, CHARINDEX(' ', @phrase, stop + 1)
FROM Pieces
WHERE stop > 0)
--Create the List of words with the CTE above
insert into @WordList
SELECT pn,
SUBSTRING(@phrase, start, CASE WHEN stop > 0 THEN stop-start ELSE 1056 END) AS WORD
FROM Pieces

DECLARE @wordCt as int
select @wordCt=count(ID) from @WordList;

-- Do the actual query using a CTE with a rownumber that repeats for every SOURCE OBJECT
;WITH WordOrder_CTE AS (
SELECT OBJ_FK, WORD_ID, WORD_TEXT,
ROW_NUMBER() OVER (Partition BY OBJ_FK ORDER BY WORD_ID) AS rownum
FROM test_WORDS)
--CREATE a flattened record of the first word in the phrase and join it to the rest of the words.
INSERT INTO @returnTable
SELECT r1.OBJ_FK, r1.WORD_ID, @phraseID AS PHRASE_ID
FROM WordOrder_CTE r1
INNER JOIN @WordList w1 ON r1.WORD_TEXT = w1.WORD and w1.ID=1
LEFT JOIN WordOrder_CTE r2
ON r1.rownum = r2.rownum - 1 and r1.OBJ_FK = r2.OBJ_FK
LEFT JOIN @WordList w2 ON r2.WORD_TEXT = w2.WORD and w2.ID=2
LEFT JOIN WordOrder_CTE r3
ON r1.rownum = r3.rownum - 2 and r1.OBJ_FK = r3.OBJ_FK
LEFT JOIN @WordList w3 ON r3.WORD_TEXT = w3.WORD and w3.ID=3
LEFT JOIN WordOrder_CTE r4
ON r1.rownum = r4.rownum - 3 and r1.OBJ_FK = r4.OBJ_FK
LEFT JOIN @WordList w4 ON r4.WORD_TEXT = w4.WORD and w4.ID=4
LEFT JOIN WordOrder_CTE r5
ON r1.rownum = r5.rownum - 4 and r1.OBJ_FK = r5.OBJ_FK
LEFT JOIN @WordList w5 ON r5.WORD_TEXT = w5.WORD and w5.ID=5

WHERE (@wordCt < 2 OR w2.ID is not null) and
(@wordCt < 3 OR w3.ID is not null) and
(@wordCt < 4 OR w4.ID is not null) and
(@wordCt < 5 OR w5.ID is not null)

--loop
SET @currentRow = @currentRow+1
END

--Return the first words of each matching phrase
SELECT OBJECT_FK, WORD_ID, PHRASE_ID FROM @returnTable

GO

--Clean up
DROP TABLE [dbo].[test_WORDS]
DROP TABLE [dbo].[test_PHRASE]

编辑后的解决方案:

这是对下面提供的正确解决方案的编辑,以考虑非连续的单词 ID。希望这对某人有帮助,就像对我一样。

;WITH
numberedwords AS (
SELECT
OBJ_FK,
WORD_ID,
WORD_TEXT,
rowcnt = ROW_NUMBER() OVER
(PARTITION BY OBJ_FK ORDER BY WORD_ID DESC),
totalInSrc = COUNT(WORD_ID) OVER (PARTITION BY OBJ_FK)
FROM dbo.test_WORDS
),
phrasedwords AS (
SELECT
nw1.OBJ_FK,
nw1.WORD_ID,
nw1.WORD_TEXT,
PHRASE_TEXT = RTRIM((
SELECT [text()] = nw2.WORD_TEXT + ' '
FROM numberedwords nw2
WHERE nw1.OBJ_FK = nw2.OBJ_FK
AND nw2.rowcnt BETWEEN nw1.rowcnt AND nw1.totalInSrc
ORDER BY nw2.OBJ_FK, nw2.WORD_ID
FOR XML PATH ('')
))
FROM numberedwords nw1
GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt, nw1.totalInSrc
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID

注意:我在生产中使用的最终查询使用索引临时表而不是 CTE。我还根据我的需要限制了 PHRASE_TEXT 列的长度。通过这些改进,我能够将查询时间从 3 分钟以上减少到 3 秒!

最佳答案

这里有一个使用不同方法的解决方案:不是将短语拆分为单词,而是将单词组合为短语。

已编辑:将 rowcnt 表达式更改为使用 COUNT(*) OVER ...,如 @ErikE 的建议。在评论中。

;WITH
numberedwords AS (
SELECT
OBJ_FK,
WORD_ID,
WORD_TEXT,
rowcnt = COUNT(*) OVER (PARTITION BY OBJ_FK)
FROM dbo.test_WORDS
),
phrasedwords AS (
SELECT
nw1.OBJ_FK,
nw1.WORD_ID,
nw1.WORD_TEXT,
PHRASE_TEXT = RTRIM((
SELECT [text()] = nw2.WORD_TEXT + ' '
FROM numberedwords nw2
WHERE nw1.OBJ_FK = nw2.OBJ_FK
AND nw2.WORD_ID BETWEEN nw1.WORD_ID AND nw1.rowcnt
ORDER BY nw2.OBJ_FK, nw2.WORD_ID
FOR XML PATH ('')
))
FROM numberedwords nw1
GROUP BY nw1.OBJ_FK, nw1.WORD_ID, nw1.WORD_TEXT, nw1.rowcnt
)
SELECT *
FROM phrasedwords pw
INNER JOIN test_PHRASE tp
ON LEFT(pw.PHRASE_TEXT, LEN(tp.PHRASE_TEXT)) = tp.PHRASE_TEXT
ORDER BY pw.OBJ_FK, pw.WORD_ID

关于sql - 将 [每行一个单词] 与 [每行多个单词] 的短语行连接起来,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/4929662/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com