gpt4 book ai didi

ruby-on-rails - Nokogiri 使用 Ruby On Rails 进行抓取无法按预期工作

转载 作者:数据小太阳 更新时间:2023-10-29 08:57:12 24 4
gpt4 key购买 nike

我是 Ruby on Rails 的新手,我想我可能遗漏了一些显而易见的东西。我目前正在开发一个抓取拍卖网站的 Web 应用程序。该应用程序的基本骨架是由其他人创建的。我正在尝试添加对新网站的抓取,但它们似乎不起作用。

我已经阅读了一些 Nokogiri 文档,并确认抓取到的信息确实没有被写入数据库(通过 Rails 控制台检查时,目标网站的种子 URL 已经写入),还使用了 Chrome 扩展 CSS Selector Tester 来检查我使用的 CSS 选择器是否正确。通过 Rails 控制台检查时,记录 ID 也是正确的。

我将我认为重要的代码部分放在下面,但我可能会遗漏一些我认为不重要的东西。

我遇到问题的网站是 Lot-art.com & Lot-Tissimo.com

任何帮助将不胜感激。

种子网址

# Seed one Source row per supported auction site.
# Each query template carries a {query} placeholder, and a {page} placeholder
# where the scraper substitutes a page number.
{
  "Auction.fr" => "https://www.auction.fr/_en/lot/search/?contexte=futures&tri=date_debut%20ASC&query={query}&page={page}",
  "Invaluable.co.uk" => "https://www.invaluable.co.uk/search/api/search-results?keyword={query}&size=1000",
  "Interencheres.com" => "http://www.interencheres.com/en/recherche/lot?search%5Bkeyword%5D={query}&page={page}",
  "Gazette-drouot.com" => "http://catalogue.gazette-drouot.com/html/g/recherche.jsp?numPage={page}&filterDate=1&query={query}&npp=100",
  "Lot-art.com" => "http://www.lot-art.com/auction-search/?form_id=lot_search_form&page=1&mq=&q={query}&ord=recent",
  "Lot-tissimo.com" => "https://lot-tissimo.com/en/cmd=s&lwr=&ww={query}&xw=&srt=SN&wg=EUR&page={page}"
}.each do |site_name, template|
  Source.create(name: site_name, query_template: template)
end

调度程序代码

require 'rufus-scheduler'

require 'nokogiri'
require 'mechanize'
require 'open-uri'
require "net/https"


s = Rufus::Scheduler.singleton

# Every minute: for each list, run each of its actions against the action's
# auction source, store the scraped lots as items of that action, and send one
# Pushover notification per list that gained at least one new item.
#
# Sources are dispatched on their database id (see the `case` below).
#
# NOTE(review): pages are fetched with Kernel#open (open-uri). That form was
# removed in Ruby 3.0 in favour of URI.open -- switch once the minimum Ruby
# version of this app is confirmed.
s.interval '1m' do
  setting = Setting.find(1)
  agent = Mechanize.new
  agent.user_agent_alias = 'Windows Chrome'

  # The cookie file only exists after a previous run has saved it; loading a
  # missing file raises and would abort the job before anything is scraped.
  cookies_path = File.join(Rails.root, 'tmp/cookies.yaml')
  agent.cookie_jar.load(cookies_path) if File.exist?(cookies_path)

  # Saves +item+ with ActiveRecord logging silenced and returns the result of
  # item.save, so callers can count only the items that were actually stored.
  save_quietly = lambda do |item|
    saved = false
    ActiveRecord::Base.logger.silence { saved = item.save }
    saved
  end

  List.all.each do |list|
    number_of_new_items = 0

    list.actions.each do |action|
      url = action.source.query_template.gsub('{query}', action.list.query)

      case action.source.id
      when 1 # Auction.fr
        20.downto(1) do |page|
          doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))

          doc.css("div.list-products > ul > li").reverse.each do |item_data|
            # Price stays 0 unless the lot shows a "Selling price" heading.
            price = 0
            price_node = item_data.at_css("h3.h4.adjucation.ft-blue")
            price_match = price_node && /Selling price : ([\d\s]+) €/.match(price_node.text)
            price = price_match[1].gsub(" ", "") if price_match

            item = action.items.new(
              title: item_data.at_css("h2").text.strip,
              url: item_data.at_css("h2 a")["href"],
              picture: item_data.at_css("div.image-wrap.lazy div.image img")["src"],
              price: price,
              currency: "€"
            )

            number_of_new_items += 1 if save_quietly.call(item)
          end
        end

      when 97 # Lot-Tissimo.com
        5.downto(1) do |page|
          doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))

          doc.css("#inhalt > .objektliste").reverse.each do |item_data|
            # No price is scraped for this source. The variable must still be
            # assigned here: the `price` locals of the other branches are
            # block-local, so reading it unassigned raises NameError.
            price = 0

            item = action.items.new(
              title: item_data.at_css("div.objli-desc").text.strip,
              url: item_data.at_css("td.objektliste-foto a")["href"],
              picture: item_data.at_css("td.objektliste-foto a#lot_link img")["src"],
              price: price,
              currency: "€"
            )

            number_of_new_items += 1 if save_quietly.call(item)
          end
        end

      when 2 # Invaluable.co.uk
        # This source answers with JSON rather than HTML.
        doc = JSON.parse(open(url).read)

        doc["itemViewList"].reverse.each do |item_data|
          view = item_data["itemView"]
          photos = view["photos"]

          puts photos

          item = action.items.new(
            title: view["title"],
            url: "https://www.invaluable.co.uk/buy-now/" + view["title"].parameterize + "-" + view["ref"],
            picture: photos.nil? ? nil : photos.first["_links"]["medium"]["href"],
            price: view["price"],
            currency: view["currencySymbol"]
          )

          number_of_new_items += 1 if save_quietly.call(item)
        end

      when 3 # Interencheres.com
        5.downto(1) do |page|
          doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))

          doc.css("div#lots_0 div.ligne_vente").reverse.each do |item_data|
            price = 0
            link = item_data.at_css("div.ph_vente div.des_vente p a")

            item = action.items.new(
              title: link.text.strip,
              url: "http://www.interencheres.com" + link["href"],
              picture: item_data.at_css("div.ph_vente div.gd_ph_vente img")["src"],
              price: price,
              currency: "€"
            )

            number_of_new_items += 1 if save_quietly.call(item)
          end
        end

      when 4 # Gazette-drouot.com
        5.downto(1) do |page|
          # Fetched through the Mechanize agent (and its cookie jar) instead of open-uri.
          doc = agent.get(url.gsub('{page}', page.to_s))

          doc.css("div#recherche_resultats div.lot_recherche").reverse.each do |item_data|
            price = 0
            thumb = item_data.at_css("img.image_thumb_recherche")
            picture = thumb ? thumb["src"] : nil

            item = action.items.new(
              title: item_data.at_css("#des_recherche").text.strip.truncate(140),
              url: "http://catalogue.gazette-drouot.com/html/g/" + item_data.at_css("a.lien_under")["href"],
              picture: picture,
              price: price,
              currency: "€"
            )

            number_of_new_items += 1 if save_quietly.call(item)
          end
        end

      when 69 # Lot-art.com
        doc = agent.get(url)

        doc.css("div.lot_list_holder").reverse.each do |item_data|
          price = 0

          # at_css returns a single node (or nil), not a list: indexing it with
          # [0] is an attribute lookup that yields nil, so the node is used directly.
          # NOTE(review): the href is read from this anchor because a <div>
          # carries no href -- confirm against the live markup.
          link = item_data.at_css("div.lot_list_body a")

          item = action.items.new(
            title: link.text.strip.truncate(140),
            url: link["href"],
            picture: item_data.at_css("a.lot_list_thumb img")["src"],
            price: price,
            currency: "€"
          )

          number_of_new_items += 1 if save_quietly.call(item)
        end
      end
    end

    # NOTE(review): notifications_this_hour is only compared here; nothing in
    # this job increments it after a push is sent -- confirm it is updated elsewhere.
    if number_of_new_items > 0 && setting.notifications_per_hour > setting.notifications_this_hour && setting.pushover_app_token.present? && setting.pushover_user_key.present?
      pushover_uri = URI.parse("https://api.pushover.net/1/messages.json")
      req = Net::HTTP::Post.new(pushover_uri.path)
      req.set_form_data({
        :token => setting.pushover_app_token,
        :user => setting.pushover_user_key,
        :message => "#{number_of_new_items} new items on #{list.name}!",
        :url_title => "Check the list",
        :url => "http://spottheauction.com/lists/#{list.id}"
      })
      res = Net::HTTP.new(pushover_uri.host, pushover_uri.port)
      res.use_ssl = true
      res.verify_mode = OpenSSL::SSL::VERIFY_PEER
      res.start { |http| http.request(req) }
    end
  end

  agent.cookie_jar.save(cookies_path)
end

# At minute 0 of every hour, reset the notification counter stored on the
# Setting record with id 1.
s.cron('0 * * * *') do
  Setting.find(1).tap { |hourly| hourly.notifications_this_hour = 0 }.save
end

最佳答案

new 只初始化一个实例,但不保存实例。你真的在某处调用了 save 吗?

你有两个选择:

对 item 调用 save:

# Option 1: build the item in memory with `new`, then persist it explicitly.
item = action.items.new(
# ...
)
# `new` alone does not store the record; `save` is the call that does.
item.save

或者使用create代替new:

# Option 2: use `create` in place of `new`, so no separate `save` call is needed.
item = action.items.create(
# ...
)

关于ruby-on-rails - Nokogiri 使用 Ruby On Rails 进行抓取无法按预期工作,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/49382344/

24 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com