gpt4 book ai didi

java - apache HttpClient、基于表单的登录和检索 HTML 内容

转载 作者:行者123 更新时间:2023-11-30 07:27:25 25 4
gpt4 key购买 nike

我想使用 apache HttpClient api 登录网站:rfp.ca 并检索其 html 内容,然后是我的代码。

我运行后只返回了登录页面的源代码:http://www.rfp.ca/login/它仍然要求提供凭据,我期待它被重定向到 http://www.rfp.ca/my_account/就像使用浏览器登录一样。

关于如何实现这个的任何想法?

提前致谢。

埃里克

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;



public class HttpGetter {



public static void main(String[] args) throws Exception {

DefaultHttpClient httpclient = new DefaultHttpClient();

HttpGet httpget = new HttpGet("http://www.rfp.ca/login/");

HttpResponse response = httpclient.execute(httpget);
HttpEntity entity = response.getEntity();

System.out.println("Login form get: " + response.getStatusLine());
if (entity != null) {
entity.consumeContent();
}
System.out.println("Initial set of cookies:");
List<Cookie> cookies = httpclient.getCookieStore().getCookies();
if (cookies.isEmpty()) {
System.out.println("None");
} else {
for (int i = 0; i < cookies.size(); i++) {
System.out.println("- " + cookies.get(i).toString());
}
}

HttpPost httpost = new HttpPost("http://www.rfp.ca/login/");

List <NameValuePair> nvps = new ArrayList <NameValuePair>();
nvps.add(new BasicNameValuePair("username", "myusername"));
nvps.add(new BasicNameValuePair("password", "mypassword"));

httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));

response = httpclient.execute(httpost);

System.out.println("Response "+response.toString());
entity = response.getEntity();

System.out.println("Login form get: " + response.getStatusLine());
if (entity != null) {

InputStream is = entity.getContent();
BufferedReader br = new BufferedReader(new InputStreamReader(is));
String str ="";
while ((str = br.readLine()) != null){
System.out.println(""+str);
}
}

System.out.println("Post logon cookies:");
cookies = httpclient.getCookieStore().getCookies();
if (cookies.isEmpty()) {
System.out.println("None");
} else {
for (int i = 0; i < cookies.size(); i++) {
System.out.println("- " + cookies.get(i).toString());
}
}
httpclient.getConnectionManager().shutdown();

}
}

返回结果如下:

    Login form get: HTTP/1.1 200 OK
Initial set of cookies:
- [version: 0][name: PHPSESSID][value: f4dc36acc705b31b15b4ea07a398a60b][domain: www.rfp.ca][path: /][expiry: null]
Response HTTP/1.1 200 OK [Date: Sat, 17 Mar 2012 04:04:49 GMT, Server: Apache/2.2.22 (Unix) mod_ssl/2.2.22 OpenSSL/1.0.0-fips mod_auth_passthrough/2.1 mod_bwlimited/1.4 FrontPage/5.0.2.2635, X-Powered-By: PHP/5.2.17, Expires: Thu, 19 Nov 1981 08:52:00 GMT, Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0, Pragma: no-cache, Vary: Accept-Encoding,User-Agent, Connection: close, Transfer-Encoding: chunked, Content-Type: text/html;charset=utf-8]
Login form get: HTTP/1.1 200 OK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"

"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" lang="en-US">

<head>

<meta name="keywords" content="" />

<meta http-equiv="Content-Type" content="text/html charset=utf-8"/>

<title>RFP.ca: User login form</title>

<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/templates/_system/main/images/css/form.css" />

<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/templates/rfp/main/images/design.css" />


<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">

<link rel="icon" href="/favicon.ico" type="image/x-icon">

<link rel="stylesheet" href="http://www.rfp.ca/system/lib/rating/style.css" type="text/css" />

<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/system/ext/jquery/css/jquery-ui.css" />

<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery.js"></script>

<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery-ui.js"></script>

<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery.validate.min.js"></script>






<style type="text/css">

*html img,

*html.png

{

azimuth: expression(

this.pngSet?

this.pngSet=true :

(this.nodeName == "IMG" ?

(this.src.toLowerCase().indexOf('.png')>-1 ?

(this.runtimeStyle.backgroundImage = "none", this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.src + "', sizingMethod='image')",

this.src = "http://www.rfp.ca/templates/rfp/main/images/blank.gif") :

'') :

(this.currentStyle.backgroundImage.toLowerCase().indexOf('.png')>-1) ?

(this.origBg = (this.origBg) ?

this.origBg :

this.currentStyle.backgroundImage.toString().replace('url("','').replace('")',''),

this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.origBg + "', sizingMethod='crop')",

this.runtimeStyle.backgroundImage = "none") :

''

), this.pngSet=true

);

}

</style>



<script type="text/javascript">

var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-254707-12']);
_gaq.push(['_trackPageview']);

(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();

</script>

</head>

<body>

<div id="messageBox"></div>

<div class="MainDiv">
<div class="headerPage">
<div class="logo">
<div class="png"></div>
<a href="http://www.rfp.ca/"><img src="http://www.rfp.ca/templates/rfp/main/images/logo.png" border="0" alt="" title="" /></a>
</div>
<div class="userMenu">
<a href="http://www.rfp.ca/" title="RFP Home"> Home</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;
<a href="http://www.rfp.ca/find_rfps/" title="Search">Search</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;
<a href="http://www.rfp.ca/rfp_alerts/?action=new" title="E-mail Alert">E-mail Alert</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;
<a href="http://www.rfp.ca/contact/" title="Contact">Contact</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;
<a href="http://www.rfp.ca/login/" title="Sign In"> Sign In</a>
<br/><br/>
<!--
<form id="langSwitcherForm" method="get" action="">
<select name="lang" onchange="location.href='http://www.rfp.ca/login/?lang='+this.value+'&amp;'" style="width: 200px;">
<option value="de">Deutsch</option>
<option value="tr">Türkçe</option>
<option value="ps">پښتو</option>
<option value="fr">Français</option>
<option value="ar">العربية</option>
<option value="en" selected="selected">English</option>
<option value="fa">فارسی</option>
<option value="ja">日本語</option>
<option value="es">Español</option>
<option value="nl">Nederlands</option>
<option value="ru">–†—É—Å—Å–∫–∏–π</option>
<option value="pt">Português</option>
</select>
</form>
-->

</div>
</div>
<div class="clr"><br /></div>

<div class="indexDiv" >







<h1>Sign In</h1>


<p style="color:#9B9B9B"><i>Tip: Username is your e-mail address</i></p>

<form action="http://www.rfp.ca/login/" method="post" id="loginForm" >

<input type="hidden" name="return_url" value="" />

<input type="hidden" name="action" value="login" />


<fieldset>

<div class="inputName">Username</div>

<div class="inputField"><input type="text" class="logInNameInput" name="username" /></div>

</fieldset>

<fieldset>

<div class="inputName">Password</div>

<div class="inputField"><input class="logInPassInput2" type="password" name="password" /></div>

</fieldset>

<fieldset>

<div class="inputName">&nbsp;</div>

<div class="inputField"><input type="checkbox" name="keep" /> Keep me signed in</div>

</fieldset>

<fieldset>

<div class="inputName">&nbsp;</div>

<div class="inputField"><input type="submit" value="Login" class="button" /></div>

</fieldset>

</form>

<br/>

<a href="http://www.rfp.ca/password_recovery/">Forgot Your Password?</a>&nbsp;|&nbsp; <a href="http://www.rfp.ca/registration/">Subscription</a>



</div>

<div id="grayBgBanner"></div>

<div class="clr"><br /></div>
<div class="bottomMenu">
<a href="http://www.rfp.ca/">Home</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/faq/"> About Us/FAQ</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/features/"> Features</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/contact/" >Contact</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/privacy_policy/">Privacy Policy</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/terms_of_use/">Terms of use</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
&nbsp;&copy; 2012 Organized Media &nbsp;<img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.twitter.com/rfpca" rel="me" target="_blank"><img src="http://www.rfp.ca/templates/rfp/main/images/twitter.gif" border="0" alt="Twitter"></a><a href="http://www.facebook.com/pages/RFPca/164233376967738" rel="me" target="_blank"><img src="http://www.rfp.ca/templates/rfp/main/images/facebook.gif" border="0" alt="Facebook"></a>
</div>
</div>
<div class="Footer">
</div>
</body>

</html>

Post logon cookies:
- [version: 0][name: PHPSESSID][value: f4dc36acc705b31b15b4ea07a398a60b][domain: www.rfp.ca][path: /][expiry: null]

我尝试添加这两个参数:

nvps.add(new BasicNameValuePair("return_url", "http://www.rfp.ca/my_account/"));
nvps.add(new BasicNameValuePair("action", "login"));

结果出现了一些错误:

Login form get: HTTP/1.1 200 OK
Initial set of cookies:
- [version: 0][name: PHPSESSID][value: e76f3b507a3db64cf1d4ad2297fb0c58][domain: www.rfp.ca][path: /][expiry: null]
Exception in thread "main" org.apache.http.client.ClientProtocolException
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:822)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:754)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:732)
at Crawler.HttpGetter.main(HttpGetter.java:203)
Caused by: org.apache.http.ProtocolException: Invalid redirect URI: ?Ûiÿü0·éq¯æɧ¢éí
at org.apache.http.impl.client.DefaultRedirectStrategy.createLocationURI(DefaultRedirectStrategy.java:185)
at org.apache.http.impl.client.DefaultRedirectStrategy.getLocationURI(DefaultRedirectStrategy.java:116)
at org.apache.http.impl.client.DefaultRedirectStrategy.getRedirect(DefaultRedirectStrategy.java:193)
at org.apache.http.impl.client.DefaultRequestDirector.handleResponse(DefaultRequestDirector.java:1035)
at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:492)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:820)
... 3 more
Caused by: java.net.URISyntaxException: Illegal character in path at index 0: ?Ûiÿü0·éq¯æɧ¢éí
at java.net.URI$Parser.fail(URI.java:2809)
at java.net.URI$Parser.checkChars(URI.java:2982)
at java.net.URI$Parser.parseHierarchical(URI.java:3066)
at java.net.URI$Parser.parse(URI.java:3024)
at java.net.URI.<init>(URI.java:578)
at org.apache.http.impl.client.DefaultRedirectStrategy.createLocationURI(DefaultRedirectStrategy.java:183)
... 8 more

最佳答案

我不确定这一点,但从我在表格中看到的情况来看,它需要以下参数

return_url:
action:login
username:myusername
password:mypassword

并且您没有在 POST 请求中提供前两个。

更新:在这种情况下获取正确参数的最佳方法是在浏览器中打开 URL,然后在 Firebug 或开发人员工具 (WebKit) 中监控网络 Activity 。它会准确地向您显示您需要以编程方式发送的内容。

关于java - apache HttpClient、基于表单的登录和检索 HTML 内容,我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/9747198/

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com