gpt4 book ai didi

internet-explorer - 循环浏览网页并复制数据

转载 作者:行者123 更新时间:2023-12-03 18:28:40 25 4
gpt4 key购买 nike

我为一个 friend 创建了这个脚本,该 friend 在一个房地产网站上循环并为她获取电子邮件地址(用于促销)。该网站免费提供它们,但一次获取一个不方便。第一个脚本将每个页面的数据转储到一个名为 webdump 的 txt 文件中,第二个脚本从第一个 txt 文件中提取电子邮件地址。将其中的每一个保存在单独的 .vbs 文件中。如果要测试脚本,可能需要将以下内容更改为较低的数字(这是处理的页面数):

Do while i < 1334

第一个出错了,我不完全确定为什么,第二个提取的不仅仅是电子邮件地址,再一次,不完全确定为什么。我不是一个技术娴熟的 vbs 人,但这些问题与我的问题无关......底部的问题......
set ie = createobject("internetexplorer.application") 
Set objShell = CreateObject("WScript.Shell")
Dim i
i = 0

Do while i < 1334
i = i + 1

ie.navigate "http://www.reoagents.net/search-3.php?category=1&firmname=&business=&address=&zip=&phone=&fax=&mobile=&im=&manager=&mail=&www=&reserved_1=&reserved_2=&reserved_3=&filterbyday=ANY&loc_one=&loc_two=&loc_three=&loc_four=&location_text=&page="&i
do until ie.readystate = 4 : wscript.sleep 10: loop

pageText = ie.document.body.innertext

set fso = createobject("scripting.filesystemobject")
set ts = fso.opentextfile("c:\webdump.txt",8,true)
ts.write pageText
ts.close

loop

Wscript.Echo "All site data copied!"

第二部分:
Const ForReading = 1
Const ForWriting = 8

Set objRegEx = CreateObject("VBScript.RegExp")
objRegEx.Pattern = "@"

Set objFSO = CreateObject("Scripting.FileSystemObject")

'Input file
Set objFileIn = objFSO.OpenTextFile("C:\webdump.txt", ForReading)
strOutputFile = "C:\cleanaddress.txt"

Do Until objFileIn.AtEndOfStream
strSearchString = objFileIn.ReadLine
Set colMatches = objRegEx.Execute(strSearchString)
If colMatches.Count > 0 Then
For Each strMatch in colMatches
' Output File
Set objFileOut = objFSO.OpenTextFile(strOutputFile, ForWriting, True)

IF InStr(strSearchString," ") = 0 THEN
objFileOut.writeline strSearchString
ELSE
objFileOut.writeline Left(strSearchString,InStr(strSearchString," ")-1)


END IF
objFileOut.Close
Set objFileOut = Nothing

Next
End If
Loop

objFileIn.Close
Wscript.Echo "Done!"

由于地址的方式,我可以轻松地循环浏览该站点上的页面……地址的最后一个数字是连续的,但是,现在我想用这个地址尝试一下:

https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes&FromSearchControl=Yes ,

这似乎是基于java的。当我点击每个页面时,地址不会改变。在这种情况下,是否可以做类似于我在其他网站上所做的事情?

最佳答案

这是真正的绝地方法:) 仅使用 XMLHttpRequests ,没有 IE 的缺点或依赖。通过 mshta 动态创建的输出窗口没有临时文件。处理速度可以通过实现异步请求或多进程环境来提高。不幸的是,目前停止脚本的唯一方法是 wscript.exe进程终止。

Option Explicit

Dim oDisplay, sUrl, sRespHeaders, sRespText, arrSetHeaders, sEventTarget, arrFormData, lPage, lMember, i, arrFormStrings, sFormData, arrMembers, arrMemeber, sUrlEmail, sRespTextEmail, sEmail

Set oDisplay = New OutputWindow
sUrl = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=IndResult&FromSearchControl=Yes"
lPage = 0
lMember = 0

' Initial webpage request
oDisplay.Write("Connecting " & vbCrLf & sUrl)
XmlHttpRequest "GET", sUrl, Array(), "", sRespHeaders, sRespText

' Loop through all pages
Do
' Get cookies, form data, listctrl
oDisplay.Write("Processing page #" & (lPage + 1))
sEventTarget = ParseFragm("__doPostBack\('(ListControl_[\s\S]*?)',", sRespText)
ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders
ParseResponse "<input type=""hidden"" name=""([\S]*?)""[\s\S]*?value=""([\s\S]*?)"" />", sRespText, arrFormData

' Update form params
For i = 0 To UBound(arrFormData)
Select Case arrFormData(i)(0)
Case "__POSTBACKCONTROL"
arrFormData(i)(1) = "JumpToPage"
Case "__EVENTTARGET"
arrFormData(i)(1) = sEventTarget
Case "__EVENTARGUMENT"
arrFormData(i)(1) = CStr(lPage)
End Select
Next

' Jump to page #lPage
arrFormStrings = Array()
ReDim arrFormStrings(UBound(arrFormData))
For i = 0 To UBound(arrFormData)
arrFormStrings(i) = EncodeUriComponent(arrFormData(i)(0)) & "=" & EncodeUriComponent(arrFormData(i)(1))
Next
sFormData = Join(arrFormStrings, "&")
PushItem arrSetHeaders, Array("Content-Type", "application/x-www-form-urlencoded")
PushItem arrSetHeaders, Array("Content-Length", CStr(Len(sFormData)))

' New page POST request
XmlHttpRequest "POST", sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText

' Parse members from new page
ParseMembers sRespText, arrMembers

' Parse members emails, and output
For Each arrMemeber in arrMembers
lMember = lMember + 1
sUrlEmail = "https://netforum.avectra.com/eweb/DynamicPage.aspx?Site=NEFAR&WebCode=PrimaryContactInfo&ind_cst_key=" & arrMemeber(0)
XmlHttpRequest "GET", sUrlEmail, Array(), "", "", sRespTextEmail
sEmail = ParseFragm("""mailto:([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6})""", sRespTextEmail)
oDisplay.WriteTable(Array(CStr(lMember), sEmail, arrMemeber(0), arrMemeber(1)))
Next

lPage = lPage + 1
Loop


Sub ParseResponse(sPattern, sResponse, arrData)
Dim oMatch
arrData = Array()
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
PushItem arrData, Array(oMatch.SubMatches(0), oMatch.SubMatches(1))
Next
End With
End Sub

Function ParseFragm(sPattern, sResponse)
Dim oMatches
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = sPattern
Set oMatches = .Execute(sResponse)
If oMatches.Count > 0 Then ParseFragm = oMatches(0).SubMatches(0)
End With
End Function

Sub ParseMembers(sRespText, arrMembers)
Dim oMatch
arrMembers = Array()
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.Pattern = "<td class[\s\S]*?>([\s\S]*?<[\s\S]*?Key=([\s\S]*?)&[\s\S]*?)</td>"
For Each oMatch In .Execute(sRespText)
PushItem arrMembers, Array(oMatch.SubMatches(1), GetInnerText(oMatch.SubMatches(0)))
Next
End With
End Sub

Sub PushItem(arrList, varItem)
ReDim Preserve arrList(UBound(arrList) + 1)
arrList(UBound(arrList)) = varItem
End Sub

Function EncodeUriComponent(sText)
With CreateObject("htmlfile")
.Write ("<script language='JScript'></script>")
EncodeUriComponent = .DocumentElement.Document.Script.EncodeUriComponent(sText)
End With
End Function

Function GetInnerText(sText)
With CreateObject("htmlfile")
.Write ("<body>" & sText & "</body>")
GetInnerText = .DocumentElement.Document.GetElementsByTagName("body")(0).InnerText
End With
End Function

Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
Dim arrHeader
With CreateObject("Msxml2.ServerXMLHTTP.3.0")
.SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
.Open sMethod, sUrl, False
For Each arrHeader In arrSetHeaders
.SetRequestHeader arrHeader(0), arrHeader(1)
Next
.Send sFormData
sRespHeaders = .GetAllResponseHeaders
sRespText = .ResponseText
End With
End Sub

Class OutputWindow

Dim oWnd, oDoc, oOutDiv, oCursorDiv, oOutTBody, sSignature, lCols

Private Sub Class_Initialize()
sSignature = "OutputWindow"
ProvideWindow()
End Sub

Private Sub ProvideWindow()
' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
Dim lWidth, lHeight
GetWindow()
If oWnd Is Nothing Then
CreateWindow()
With oWnd
With .Document
.GetElementsByTagName("head")(0).AppendChild .CreateElement("style")
.stylesheets(0).cssText = "body, td, #output {font-family: consolas, courier new; font-size: 9pt;} #cursor {margin: 3px;} body {background-color: buttonface;} #output {height: 100%; width: 100%; overflow: scroll; background: #FFF;} div.hline {height: 1px; width: 100%; background-color: #000; overflow: hidden;} table {width: 100%; TEXT-ALIGN: center; border-COLLAPSE: collapse; background: transparent; margin-top: 1px;} td {border: black 1px solid;}"
.Title = "Output Window"
.Body.InnerHtml = "<div id='output'><div id='cursor'><img src='' /></div></div>"
End With
lWidth = CInt(.Screen.AvailWidth * 0.75)
lHeight = CInt(.Screen.AvailHeight * 0.75)
.ResizeTo .Screen.AvailWidth, .Screen.AvailHeight
.ResizeTo lWidth + .Screen.AvailWidth - .Document.Body.OffsetWidth, lHeight + .Screen.AvailHeight - .Document.Body.OffsetHeight
.MoveTo CInt((.Screen.AvailWidth - lWidth) / 2), CInt((.Screen.AvailHeight - lHeight) / 2)
End With
End If
Set oDoc = oWnd.Document
Set oOutDiv = oWnd.output
Set oCursorDiv = oWnd.cursor
lCols = -1
End Sub

Private Sub GetWindow()
Dim oShellWnd
On Error Resume Next
For Each oShellWnd In CreateObject("Shell.Application").Windows
Set oWnd = oShellWnd.GetProperty(sSignature)
If Err.Number = 0 Then Exit Sub
Err.Clear
Next
Set oWnd = Nothing
End Sub

Private Sub CreateWindow()
Dim oProc
Do
Set oProc = CreateObject("WScript.Shell").exec("mshta ""about:<head><script>moveTo(-32000,-32000);window.document.title=' ';</script><hta:application id=app border=dialog minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=yes selection=yes innerborder=no /><object id='shellwindow' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shellwindow.putproperty('" & sSignature & "',window);</script></head>""")
Do
If oProc.Status > 0 Then Exit Do
GetWindow()
If Not (oWnd Is Nothing) Then Exit Sub
Loop
Loop
End Sub

Private Sub ChkDoc()
On Error Resume Next
If TypeName(oDoc) <> "HTMLDocument" Then ProvideWindow()
End Sub

Public Sub Write(sText)
Dim oDiv
ChkDoc()
On Error Resume Next
Set oDiv = oDoc.CreateElement("div")
oDiv.InnerHtml = EscapeHtml(sText) & "<div class='hline'></div>"
oOutDiv.AppendChild oDiv
oOutDiv.AppendChild oCursorDiv
oOutDiv.ScrollTop = oOutDiv.ScrollHeight
lCols = -1
End Sub

Public Sub WriteTable(arrCells)
Dim sInner, oTable, oRow, oTr, oCell, n
ChkDoc()
On Error Resume Next
If UBound(arrCells) <> lCols Then
Set oTable = oDoc.CreateElement("table")
oOutDiv.AppendChild oTable
Set oOutTBody = oDoc.CreateElement("tbody")
oTable.AppendChild oOutTBody
lCols = UBound(arrCells)
End If
Set oTr = oDoc.CreateElement("tr")
oOutTBody.AppendChild oTr
For n = 0 To lCols
Set oCell = oTr.InsertCell(n)
oCell.InnerHtml = EscapeHtml(arrCells(n))
Next
oOutDiv.AppendChild oCursorDiv
oOutDiv.ScrollTop = oOutDiv.ScrollHeight
End Sub

Public Sub BreakTable()
lCols = -1
End Sub

Private Function EscapeHtml(sCnt)
Dim n
sCnt = Replace(sCnt, "&", "&amp;")
sCnt = Replace(sCnt, """", "&quot;")
sCnt = Replace(sCnt, "<", "&lt;")
sCnt = Replace(sCnt, ">", "&gt;")
sCnt = Replace(sCnt, "'", "&#39;")
sCnt = Replace(sCnt, vbCrLf, "<br>")
sCnt = Replace(sCnt, Chr(9), "&nbsp;&nbsp;&nbsp;&nbsp;")
sCnt = Replace(sCnt, " ", " &nbsp;")
sCnt = Replace(sCnt, "&nbsp; ", "&nbsp;&nbsp;")
For n = 0 To 31
sCnt = Replace(sCnt, Chr(n), "¶")
Next
EscapeHtml = sCnt
End Function

Private Sub Class_Terminate()
' oWnd.close
End Sub

End Class

关于internet-explorer - 循环浏览网页并复制数据,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/28384650/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com