
asp.net-mvc-3 - How to build a web crawler with ASP.NET MVC 3 and C#?


I need to build a small search engine like Google using ASP.NET MVC 3. For that I also need to build a web crawler to populate data for the search engine. In summary, I need the following:

How to build a web crawler with ASP.NET MVC 3 and C#. I have started on some basics, but I need guides, resources, and books.

With this web crawler I will populate the search engine's data and index it with Lucene.Net. Then I need to build a small search engine that searches the data the crawler has collected.

If anyone has ideas, resources, or books, please share them.

Best Answer

I built a web crawler and search using ASP.NET 4.0 (Web Forms, not MVC) and C#. I'll post the details below; it may help someone.

1. Create a table with the following name and fields. Name: tblCrawlingData. Fields: Id (int), Url (text), Title (text), Contents (text).
2. I use LINQ to SQL to talk to the database, so set up LINQ to SQL yourself first.
3. I use the Lucene.Net library for indexing. The Lucene.Net class (Lucene.Net.dll):

    using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.IO;
using DataManager.BOL;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Search;
using Lucene.Net.Documents;
using Lucene.Net.QueryParsers;
using System.Diagnostics;
namespace AryaSearch.LuceneSearch
{
public class LuceneSearch
{
//Total number of records matched by the last search
private static long _totalResult = 0;
//Time taken (in seconds) to retrieve the records
private static double _totalTime = 0;
// Full physical path of lucene index directory.
private static string _luceneDir = Path.Combine(HttpContext.Current.Request.PhysicalApplicationPath, "IndexStorage");
private static FSDirectory _directoryTemp;

private static FSDirectory _directory
{
get
{
if (_directoryTemp == null)
_directoryTemp = FSDirectory.Open(new DirectoryInfo(_luceneDir));
if (IndexWriter.IsLocked(_directoryTemp))
IndexWriter.Unlock(_directoryTemp);
var lockFilePath = Path.Combine(_luceneDir, "write.lock");
if (File.Exists(lockFilePath))
File.Delete(lockFilePath);

return _directoryTemp;
}
}


private static void _addToLuceneIndex(CrawlingData spiderData, IndexWriter writer)
{
// remove older index entry
var searchQuery = new TermQuery(new Term("Id", spiderData.Id.ToString()));
writer.DeleteDocuments(searchQuery);
//add new index entry
var doc = new Document();
// add lucene fields mapped to db fields
doc.Add(new Field("Id", spiderData.Id.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.Add(new Field("Url", spiderData.Url, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("Title", spiderData.Title, Field.Store.YES, Field.Index.ANALYZED));
doc.Add(new Field("Contents", spiderData.Contents, Field.Store.YES, Field.Index.ANALYZED));

// add entry to index
writer.AddDocument(doc);
}



public static void AddUpdateLuceneIndex(IEnumerable<CrawlingData> spiderDatas)
{
// initialize lucene
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
{
// add data to lucene search index (replaces older entry if any) by calling method _addToLuceneIndex
foreach (var spiderData in spiderDatas)
_addToLuceneIndex(spiderData, writer);

// the using statement disposes the writer; just close the analyzer
analyzer.Close();
}
}


public static void AddUpdateLuceneIndex(CrawlingData spiderData)
{
AddUpdateLuceneIndex(new List<CrawlingData> { spiderData });
}


public static void ClearLuceneIndexRecord(int record_id)
{
//initialize Lucene
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
{
// remove the older or deleted entry from lucene index
var searchQuery = new TermQuery(new Term("Id", record_id.ToString()));
writer.DeleteDocuments(searchQuery);

//the using statement disposes the writer; just close the analyzer
analyzer.Close();
}
}


public static bool ClearLuceneIndex()
{
try
{
//initialize Lucene
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
using (var writer = new IndexWriter(_directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED))
{
// remove all older index entries
writer.DeleteAll();
//the using statement disposes the writer; just close the analyzer
analyzer.Close();
}


}
catch (Exception)
{
return false;
}
return true;
}



public static void Optimize()
{
//initialize Lucene and merge the index segments
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
using (var writer = new IndexWriter(_directory, analyzer, IndexWriter.MaxFieldLength.UNLIMITED))
{
writer.Optimize();
analyzer.Close();
}
}


private static CrawlingData _mapLuceneDocumentToData(Document doc)
{
try
{
// truncate long contents to a 500-character snippet (guard against shorter values)
var contents = doc.Get("Contents");
if (contents != null && contents.Length > 500)
contents = contents.Substring(0, 500) + "...";
return new CrawlingData
{
Id = Convert.ToInt32(doc.Get("Id")),
Url = doc.Get("Url"),
Title = doc.Get("Title"),
Contents = contents,
Total = _totalResult,
Time = _totalTime
};
}
catch (Exception)
{
return null;
}
}



private static IEnumerable<CrawlingData> _mapLuceneToDataList(IEnumerable<Document> hits)
{

return hits.Select(_mapLuceneDocumentToData).ToList();
}



private static IEnumerable<CrawlingData> _mapLuceneToDataList(IEnumerable<ScoreDoc> hits, IndexSearcher searcher)
{

return hits.Select(hit => _mapLuceneDocumentToData(searcher.Doc(hit.Doc))).ToList();

}



private static Query parseQuery(string searchQuery, QueryParser parser)
{
Query query;
try
{
query = parser.Parse(searchQuery.Trim());
}
catch (ParseException)
{
query = parser.Parse(QueryParser.Escape(searchQuery.Trim()));
}

return query;
}

private static IEnumerable<CrawlingData> _search(string searchQuery, string searchField = "")
{
//validate the input
if (string.IsNullOrEmpty(searchQuery.Replace("*", "").Replace("?", "")))
return new List<CrawlingData>();
// set up lucene searcher
using (var searcher = new IndexSearcher(_directory, false))
{
//when Lucene gets more than 1000 search results,
//it becomes increasingly slow, so you'd want to limit it to a number which is relevant in your case
//var hits_limit = 1000;
var hits_limit = 10;
//initialize the lucene class.
var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
// search by single field
if (!string.IsNullOrEmpty(searchField))
{
var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, searchField, analyzer);
var query = parseQuery(searchQuery, parser);
// time the search so the UI can show "About N results (t seconds)"
Stopwatch watch = Stopwatch.StartNew();
var hitsTemp = searcher.Search(query, hits_limit);
watch.Stop();
// divide by 1000.0 so the fractional seconds survive
_totalTime = Math.Round(watch.ElapsedMilliseconds / 1000.0, 2);
_totalResult = hitsTemp.TotalHits;
var hits = hitsTemp.ScoreDocs;
var results = _mapLuceneToDataList(hits, searcher);
// the using statement disposes the searcher; just close the analyzer
analyzer.Close();
return results;
}
// search across multiple fields, ordered by relevance
else
{
var parser = new MultiFieldQueryParser
(Lucene.Net.Util.Version.LUCENE_30, new[] { "Id", "Url", "Title", "Contents" }, analyzer);
var query = parseQuery(searchQuery, parser);
Stopwatch watch = Stopwatch.StartNew();
var hitsTemp = searcher.Search(query, null, hits_limit, Sort.RELEVANCE);
watch.Stop();
_totalTime = Math.Round(watch.ElapsedMilliseconds / 1000.0, 2);
_totalResult = hitsTemp.TotalHits;
var hits = hitsTemp.ScoreDocs;
var results = _mapLuceneToDataList(hits, searcher);
analyzer.Close();
return results;
return results;
}
}
}



public static IEnumerable<CrawlingData> Search(string input, string fieldName = "")
{
if (string.IsNullOrEmpty(input)) return new List<CrawlingData>();

var terms = input.Trim().Replace("-", " ").Split(' ')
.Where(x => !string.IsNullOrEmpty(x)).Select(x => x.Trim() + "*");
input = string.Join(" ", terms);

return _search(input, fieldName);
}



public static IEnumerable<CrawlingData> SearchDefault(string input, string fieldName = "")
{
return string.IsNullOrEmpty(input) ? new List<CrawlingData>() : _search(input, fieldName);
}



public static IEnumerable<CrawlingData> GetAllIndexRecords()
{

if (!System.IO.Directory.EnumerateFiles(_luceneDir).Any()) return new List<CrawlingData>();


using (var searcher = new IndexSearcher(_directory, false))
using (var reader = IndexReader.Open(_directory, false))
{
var docs = new List<Document>();
var termDocs = reader.TermDocs();
while (termDocs.Next()) docs.Add(searcher.Doc(termDocs.Doc));
return _mapLuceneToDataList(docs);
}
}

}
}
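To show how the class above is wired up, here is a minimal usage sketch (the record values are invented for illustration; CrawlingData is the business object defined further down in this answer):

    // Index one crawled record, then query it back (sketch).
    var page = new CrawlingData
    {
        Id = 1,
        Url = "http://example.com",
        Title = "Example Domain",
        Contents = "Example body text"
    };
    LuceneSearch.AddUpdateLuceneIndex(page);             // add or replace this record in the index
    var hits = LuceneSearch.Search("example", "Title");  // wildcard-expanded search on one field
    var all = LuceneSearch.SearchDefault("example");     // query as typed, across all mapped fields

SearchDefault passes the input straight to the query parser, while Search appends a trailing * to every term, which gives prefix matching.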

4. The crawler functionality. I use HtmlAgilityPack.dll to parse the HTML text. The crawling page (Default.aspx):

    <%@ Page Title="Home Page" Language="C#" MasterPageFile="~/Site.master" AutoEventWireup="true"
CodeFile="Default.aspx.cs" Inherits="_Default" %>

<asp:Content ID="HeaderContent" runat="server" ContentPlaceHolderID="HeadContent">
</asp:Content>
<asp:Content ID="BodyContent" runat="server" ContentPlaceHolderID="MainContent">
<div class="crawlText">
<h2 class="customAlignMent">
<font color="red">A</font><font color="blue">R</font><font color="black">Y</font><font
color="green">A</font>
</h2>
<asp:UpdatePanel ID="UpdatePanel1" runat="server" UpdateMode="Conditional">
<ContentTemplate>
<asp:TextBox ID="txtCrawl" runat="server" TextMode="SingleLine" Width="60%"></asp:TextBox>
<asp:Button ID="btnCrawling" runat="server" Text="Start Crawling" CssClass="submitButton"
OnClick="btnCrawling_Click" OnClientClick="javascript:ClearOldText();"/>
<p>
<asp:RequiredFieldValidator ID="txtCrawlUrl" runat="server" ErrorMessage="*Required!"
ControlToValidate="txtCrawl" CssClass="failureNotification" Display="Dynamic"></asp:RequiredFieldValidator>
<asp:RegularExpressionValidator ID="txtCrawlUrlR" runat="server" ErrorMessage="Please Enter Valid Url!"
ControlToValidate="txtCrawl" CssClass="failureNotification" ValidationExpression="^(ht|f)tp(s?)\:\/\/[0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])*(:(0-9)*)*(\/?)([a-zA-Z0-9\-\.\?\,\'\/\\\+&amp;%\$#_]*)?$"
Display="Dynamic"></asp:RegularExpressionValidator>
</p>
<p>
&nbsp;</p>
<div id="showExtractConetent" runat="server" class="ExtractContent">
</div>
</ContentTemplate>
</asp:UpdatePanel>
<asp:UpdateProgress ID="UpdateProgress1" runat="server" AssociatedUpdatePanelID="UpdatePanel1"
DisplayAfter="5">
<ProgressTemplate>
<div id="divStatusID" runat="server">
<p class="textEntry">
please wait,processing....</p>
<asp:Image ID="ajaxloading" runat="server" ImageUrl="~/Image/loading.gif" AlternateText="processing...." />
</div>
</ProgressTemplate>
</asp:UpdateProgress>
</div>
<script type="text/javascript">
<!--
function ClearOldText() {

$get("<%=showExtractConetent.ClientID%>").innerHTML = '';
}
-->
</script>
</asp:Content>

The code-behind file (Default.aspx.cs):

    using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;
using System.Text;
using DataManager.BOL;
using DataManager.BLL;
using AryaSearch.LuceneSearch;
using HtmlAgilityPack;
using System.Web.UI.HtmlControls;

public partial class _Default : System.Web.UI.Page
{

protected void Page_Load(object sender, EventArgs e)
{

}
protected void btnCrawling_Click(object sender, EventArgs e)
{
showExtractConetent.InnerHtml = string.Empty;
Webcrawling(txtCrawl.Text.Trim());
}

private void Webcrawling(string url)
{
CrawlingData cData = new CrawlingData();
string _requestUrl = url.Trim();
string _encodingType = "utf-8";
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(_requestUrl);
request.AllowAutoRedirect = true;
request.MaximumAutomaticRedirections = 3;
request.UserAgent = "Mozilla/6.0 (MSIE 6.0; Windows NT 5.1; Arya.NET Robot)";
request.KeepAlive = true;
request.Timeout = 15 * 1000; // 15-second timeout

HttpWebResponse response = (HttpWebResponse)request.GetResponse();
showExtractConetent.InnerHtml = response.StatusDescription;
showExtractConetent.InnerHtml += "<br/>";
_encodingType = GetEncodingType(response);
System.IO.StreamReader reader = new System.IO.StreamReader
(response.GetResponseStream(), Encoding.GetEncoding(_encodingType));
// Read the content.
string responseFromServer = reader.ReadToEnd();
ISet<string> Links = ExtractLinkFromContent(responseFromServer);//gets the links only
//Display the title (append, so the status line above is kept)
string pageTitle = getPageTitle(responseFromServer);
showExtractConetent.InnerHtml += "<h1 class=bold>" + pageTitle + "</h1>";
// Display the content.
//Console.WriteLine(responseFromServer);
showExtractConetent.InnerHtml += StripHtmlTags(responseFromServer);
showExtractConetent.InnerHtml += "<br/>";
//Display the links found on the page
string urls = string.Empty;
foreach (string l in Links)
{
urls += l + "<br/>";
}
showExtractConetent.InnerHtml += "<h1 class=bold>Links of Pages</h1>";
showExtractConetent.InnerHtml += urls;
cData.Url = _requestUrl;
cData.Title = pageTitle;
cData.Contents = StripHtmlTags(responseFromServer);
var dbstatus = DataManagerBLL.InsertCrawlingData(cData);
//Create Lucene Index For This Data.
if (dbstatus>0)
{
cData.Id = (int)dbstatus;
LuceneSearch.AddUpdateLuceneIndex(cData);
}
// Clean up the streams.
reader.Close();
//dataStream.Close();
response.Close();


}

private ISet<string> ExtractLinkFromContent(String content)
{
Regex regexLink = new Regex("(?<=<a\\s*?href=(?:'|\"))[^'\"]*?(?=(?:'|\"))");

ISet<string> newLinks = new HashSet<string>();
foreach (var match in regexLink.Matches(content))
{
if (!newLinks.Contains(match.ToString()))
newLinks.Add(match.ToString());
}

return newLinks;
}
private string GetEncodingType(HttpWebResponse response)
{
// CharacterSet holds the charset from the Content-Type header;
// ContentEncoding would be the transfer encoding (e.g. gzip), not a charset
string pageCharset = response.CharacterSet;
return !string.IsNullOrEmpty(pageCharset) ? pageCharset : "utf-8";
}
private string StripHtmlTags(string html)
{
if (String.IsNullOrEmpty(html))
return string.Empty;
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.OptionFixNestedTags = true;
doc.LoadHtml(html);
doc.DocumentNode.Descendants()
.Where(n => n.Name == "script" || n.Name == "style" || n.Name == "#comment")
.ToList()
.ForEach(n => n.Remove());
//Read the body content only.
HtmlNode.ElementsFlags.Remove("form");
HtmlNode.ElementsFlags.Remove("a");
HtmlNode.ElementsFlags.Remove("img");
HtmlAgilityPack.HtmlNode bodyNode = doc.DocumentNode.SelectSingleNode("//body");
HtmlAgilityPack.HtmlNode formNode = bodyNode.SelectSingleNode("//form");
HtmlNodeCollection tags = bodyNode.SelectNodes("//form | //input | //select | //textarea");
if (tags != null)
{
// remove the nodes from the document itself, not just from the collection
foreach (HtmlNode tag in tags.ToList())
tag.Remove();
}
if (bodyNode != null)
{
string tempdivtext = bodyNode.InnerText;
//tempdivtext = tempdivtext.Replace(formNode.InnerText, "");
tempdivtext = tempdivtext.Replace("\r\n", string.Empty);
tempdivtext = tempdivtext.Replace("\n", string.Empty);
tempdivtext = tempdivtext.Replace("\t", string.Empty);
return tempdivtext.Trim();
}

return doc.DocumentNode.InnerText;
}

private string getPageTitle(string html)
{
if (String.IsNullOrEmpty(html))
return string.Empty;
HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
doc.OptionFixNestedTags = true;
doc.LoadHtml(html);
//Read the body content only.
HtmlAgilityPack.HtmlNode titleNode = doc.DocumentNode.SelectSingleNode("//title");
if (titleNode != null)
return titleNode.InnerText;

return doc.DocumentNode.InnerText;

}
}
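The Webcrawling method above fetches and indexes a single page; the links that ExtractLinkFromContent finds are only displayed. Here is a sketch of how they could be followed to crawl further pages. This is my addition, not part of the original code, and CrawlPage is hypothetical: it stands for Webcrawling refactored to return the ISet<string> of links it extracted.

    // Breadth-first crawl over extracted links (sketch).
    private void CrawlBreadthFirst(string startUrl, int maxPages)
    {
        var visited = new HashSet<string>();
        var queue = new Queue<string>();
        queue.Enqueue(startUrl);
        while (queue.Count > 0 && visited.Count < maxPages)
        {
            string url = queue.Dequeue();
            if (!visited.Add(url))
                continue; // already crawled
            try
            {
                // CrawlPage (hypothetical) = Webcrawling refactored to return
                // the links it extracted from the fetched page.
                foreach (string link in CrawlPage(url))
                {
                    if (link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                        queue.Enqueue(link); // absolute links only; relative URLs would need resolving
                }
            }
            catch (Exception)
            {
                // skip pages that fail to download or parse
            }
        }
    }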

Now the search page, Search.aspx:

    <%@ Page Title="" Language="C#" MasterPageFile="~/Site.master" AutoEventWireup="true"
CodeFile="Search.aspx.cs" Inherits="Search" %>

<asp:Content ID="Content1" ContentPlaceHolderID="HeadContent" runat="Server">
</asp:Content>
<asp:Content ID="Content2" ContentPlaceHolderID="MainContent" runat="Server">
<div class="SearchHeadingText">
<h2 class="customAlignMent">
<font color="red">A</font><font color="blue">R</font><font color="black">Y</font><font
color="green">A</font>
</h2>

<asp:UpdatePanel ID="UpdatePanelForSearch" runat="server" UpdateMode="Conditional">
<ContentTemplate>
<asp:TextBox ID="txtSearch" runat="server" TextMode="SingleLine" Width="70%"></asp:TextBox>
<asp:Button ID="btnAryaSearch" runat="server" Text="Arya Search"
CssClass="submitButton" onclick="btnAryaSearch_Click" />

<p>
&nbsp;</p>
<div id="showExtractConetent" runat="server" class="SearchResult">
</div>
</ContentTemplate>
</asp:UpdatePanel>
<asp:UpdateProgress ID="UpdateProgressSearch" runat="server" AssociatedUpdatePanelID="UpdatePanelForSearch"
DisplayAfter="5">
<ProgressTemplate>
<div id="divStatusID" runat="server">
<p class="textEntry">
please wait,Searching....</p>
<asp:Image ID="ajaxloading" runat="server" ImageUrl="~/Image/loading.gif" AlternateText="processing...." />
</div>
</ProgressTemplate>
</asp:UpdateProgress>
</div>
</asp:Content>

The code-behind file, Search.aspx.cs:

    using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using DataManager.BOL;
using DataManager.BLL;
using System.Text;

public partial class Search : System.Web.UI.Page
{
protected void Page_Load(object sender, EventArgs e)
{

}
protected void btnAryaSearch_Click(object sender, EventArgs e)
{

string input = txtSearch.Text.Trim();
string fieldName = string.Empty;
//fieldName = "Title";
List<CrawlingData> result = DataManagerBLL.searchResult(input, fieldName);
//Build the result HTML with a StringBuilder
StringBuilder resultText = new StringBuilder();
if (result != null && result.Count > 0)
{
resultText.AppendFormat("<p>About {0} results ({1} seconds)</p>", result[0].Total.ToString(), result[0].Time.ToString());
foreach (CrawlingData s in result)
{
resultText.AppendFormat("<a href='{0}' target='{1}'>{2}</a>", s.Url, "_blank", s.Title);
resultText.AppendFormat("<br/>");
resultText.AppendFormat("<em class=\"customlink\">{0}</em>", s.Url);
resultText.AppendFormat("<br/>");
resultText.AppendFormat("<span class=\"searchContent\"><em>{0}<em></span>", s.Contents);
resultText.AppendFormat("<br/><br/>");

}
showExtractConetent.InnerHtml = resultText.ToString();
}
else
{
showExtractConetent.InnerText = "No results found";
}

}
}
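One caveat worth adding (my note, not part of the original answer): the loop above writes crawled titles and contents into the page as raw HTML, so any markup that survives StripHtmlTags is echoed verbatim. Encoding the values before appending them is safer, for example:

    // Hedged variant of the link line: System.Web.HttpUtility encodes the
    // crawled values before they are written into the result HTML.
    resultText.AppendFormat("<a href='{0}' target='_blank'>{1}</a>",
        HttpUtility.HtmlEncode(s.Url), HttpUtility.HtmlEncode(s.Title));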

The LINQ to SQL business object:

    using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
namespace DataManager.BOL
{


public class CrawlingData
{
public Int32 Id { get; set; }
public string Url { get; set; }
public string Title { get; set; }
public string Contents { get; set; }
public long Total { get; set; }
public double Time { get; set; }
}

}

The LINQ to SQL business logic (CRUD operations):

    using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using DataManager.BOL;
using DataManagerDAL;
using AryaSearch.LuceneSearch;
namespace DataManager.BLL
{
public static class DataManagerBLL
{
public static long InsertCrawlingData(CrawlingData cdata)
{
long lastInsertId = 0;
try
{
using (DataClassesDataContext _db = new DataClassesDataContext())
{


tblCrawlingData dataUpdate = _db.tblCrawlingDatas.FirstOrDefault(c => c.Url.ToLower() == cdata.Url.ToLower());
if (dataUpdate == null)
{
tblCrawlingData data = new tblCrawlingData();
data.Id = cdata.Id;
data.Url = cdata.Url;
data.Title = cdata.Title;
data.Contents = cdata.Contents;
_db.tblCrawlingDatas.InsertOnSubmit(data);
_db.SubmitChanges();
lastInsertId = data.Id;
}
else
{
//data.Id = cdata.Id;
dataUpdate.Url = cdata.Url;
dataUpdate.Title = cdata.Title;
dataUpdate.Contents = cdata.Contents;
_db.SubmitChanges();
lastInsertId = dataUpdate.Id;
}
return lastInsertId;
}
}
catch (Exception)
{
return 0;
}
}

public static bool UpdateCrawlingData(CrawlingData cdata)
{
try
{
using (DataClassesDataContext _db = new DataClassesDataContext())
{
//tblCrawlingData data = new tblCrawlingData();
tblCrawlingData data = _db.tblCrawlingDatas.FirstOrDefault(c => c.Id == cdata.Id);
if (data == null)
return false;
data.Id = cdata.Id;
data.Url = cdata.Url;
data.Title = cdata.Title;
data.Contents = cdata.Contents;
_db.SubmitChanges();
LuceneSearch.Optimize();
return true;
}
}
catch (Exception)
{
return false;
}
}


public static List<CrawlingData> searchResult(string input, string fieldName)
{
LuceneSearch.Optimize();
return LuceneSearch.Search(input, fieldName).ToList();

}


}
}
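DataClassesDataContext and tblCrawlingData above come from the LINQ to SQL designer (.dbml file) mentioned in step 2, so their code never appears in this post. For orientation, here is a hand-written sketch of an equivalent mapping using the standard System.Data.Linq.Mapping attributes; the real designer-generated code looks different, and the connection string is a placeholder:

    using System.Data.Linq;
    using System.Data.Linq.Mapping;

    // Sketch of the mapping for the table from step 1. Id is assumed to be
    // an auto-generated identity primary key.
    [Table(Name = "tblCrawlingData")]
    public class tblCrawlingData
    {
        [Column(IsPrimaryKey = true, IsDbGenerated = true)]
        public int Id { get; set; }

        [Column] public string Url { get; set; }
        [Column] public string Title { get; set; }
        [Column] public string Contents { get; set; }
    }

    // Matching data context exposing the table.
    public class DataClassesDataContext : DataContext
    {
        public DataClassesDataContext()
            : base("<your connection string here>") { }

        public Table<tblCrawlingData> tblCrawlingDatas
        {
            get { return GetTable<tblCrawlingData>(); }
        }
    }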

If anybody has any questions, please contact me. I put a lot of work into this, so if it helps you, please vote it up. Thanks.

Regarding "asp.net-mvc-3 - How to build a web crawler with ASP.NET MVC 3 and C#?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/13365004/
