gpt4 book ai didi

Java 模式匹配无法找到具有正则表达式 [A-Z0-9._%+-]+@[A-Z0-9.-]{3,65}\.[A-Z]{2,4} 的电子邮件

转载 作者:行者123 更新时间:2023-11-29 04:17:02 25 4
gpt4 key购买 nike

<分区>

我正在尝试从网页中提取电子邮件;我有 60 个随机网站 url 并试图从中提取电子邮件以进行测试,我正在使用此 [A-Z0-9._%+-]+@[A-Z0-9.-]{3,65 }.[A-Z]{2,4} 用于在页面中查找电子邮件的正则表达式,并使用 JSoup 解析网站。

在一个工作源中编辑代码

import java.io.IOException;
import java.net.MalformedURLException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSession;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class TestingMail {
// HERE WE CONSTRUCT CLASS
public TestingMail(){}


/****************** SETTING MAIN METHOD TO TEST CLASS *************************/
public static void main(String[] args){
// Setting initiator
String Terms="Trending Bitcoin Investment Chat in NETHERLANDS";
TestingMail extractor=new TestingMail();
extractor.extract(Terms, extractor);
}


/****************** HERE WE CONSTRUCT THE EXTRACT METHOD **********************/
public void extract(String terms, TestingMail extractor){
// HERE WE START CONSTRUCTING THE EXTRACT PROCESSES
int NUM_THREADS=10;
int limit=10;
String[] parseURL={};
String[] crawedURL={};
int istype=0;
int start=0;
// HERE WE START PROCESSING
if(terms!=null && terms.length()>0){
SSLContext sc = null;

// LETS DISABLE SSL CERTIFICATE
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
}
};

try {
sc = SSLContext.getInstance("SSL");
} catch (NoSuchAlgorithmException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
sc.init(null, trustAllCerts, new java.security.SecureRandom());
} catch (KeyManagementException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());

// Create all-trusting host name verifier
HostnameVerifier allHostsValid = new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
};

// Install the all-trusting host verifier
HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);

// HERE LETS CRAW DATA FROM GOOGLE
crawedURL=new String[]{"https://www.globfinances.com", "https://napoleoninvestment.net", "https://www.meetup.com/BitcoinWednesday/?_cookie-check=PXZ_aLyoOMcdpbrs"};
if(crawedURL!=null && crawedURL.length>0){
// Here we loop mails to store send mails
if(crawedURL.length<limit){
limit=crawedURL.length;
istype=1;
}

// Here we set the mails length
parseURL=new String[limit];
// HERE WE START THREAD POOL
ExecutorService es = Executors.newFixedThreadPool(NUM_THREADS);
List<Future<Integer>> futures = new ArrayList<>(NUM_THREADS);

// Submit task to every thread:
for (int i = 0; i < NUM_THREADS; i++) {
// Here we loop to get mails
if(start<crawedURL.length){
for(int k=start, j=0; j<crawedURL.length; k++, j++){
if(k<(limit-1)){
System.out.println(i+"=="+j);
// System.out.println(mails[k]);
parseURL[j]=crawedURL[k];
}
else{
start+=limit+1;
break;
}
}
// Here we thread task
futures.add(i, es.submit((Callable<Integer>) new Extractor(parseURL, extractor)));
}
else{
istype=1;
break;
}

// Checking thread type to prevent multiple run
if(istype==1){
break;
}
} // end of loop

// Shutdown thread pool
es.shutdown();
System.out.println("Thread: "+futures.size());
}
}
}


/******************* HERE WE CONSTRUCT THE EXTRACT METHOD *******************/
private Integer mailExtract(String[] urls) throws MalformedURLException{
// HERE WE START CONSTRUCTING THE MAIL EXTRACTED PROCESS
String pattern = "\\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b";
Set<String> emails = new HashSet<>();
String[][] extracted={};
int totalMails=0;
// HERE WE START PROCESSING
if(urls!=null && urls.length>0){
extracted=new String[urls.length][];
// Now lets extract mails
Pattern pat = Pattern.compile(pattern);
// Now lets loop
for(int i=0; i<urls.length; i++){
emails=parse(urls[i], pat);
int key=0;
if(emails.size()>0){
for(String email:emails){
extracted[i][key]=email;
key++;
} // end of loop
}
} // end of loop

// HERE WE CHECK EXTRACTED LENGTH
for(int j=0; j<extracted.length; j++){
totalMails=totalMails+extracted[j].length;
} // end of loop

System.out.println(totalMails);
}

// Here we return
return Integer.valueOf(totalMails);
}


/********* HERE WE START CONSTRUCTING THE PARSE FUNCTIONS **********/
public Set<String> parse(String url, Pattern pat){
// HERE WE CONSTRUCT THE EMAIL PARSER PROCESS
Set<String> emailAddresses = new HashSet<>();
boolean found=false;
String contents="";
// HERE WE START PROCESSING
if(url!=null){
contents=urlContent(url);
if(contents.length()>0 && contents.indexOf("body")>=0){
// Pattern pat = Pattern.compile(pattern);
//Matches contents against the given Email Address Pattern
Matcher match = pat.matcher(contents);
found=match.find();
//If match found, append to emailAddresses
System.out.println("I found this: "+found);
while(found) {
emailAddresses.add(match.group());
} // end of while loop
}
}

// Here we return
return emailAddresses;
}


// HERE WE READ URL CONTENT TO STRING
private String urlContent(String url){
// HERE WE CONSTRUCT THE URL CONTENT RETURNER
String content="";
Document doc=null;
String sUrl="";
// HERE WE START PROCESSING
try {
SSLContext sc = null;

// LETS DISABLE SSL CERTIFICATE
// Create a trust manager that does not validate certificate chains
TrustManager[] trustAllCerts = new TrustManager[] {new X509TrustManager() {
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
public void checkClientTrusted(X509Certificate[] certs, String authType) {
}
public void checkServerTrusted(X509Certificate[] certs, String authType) {
}
}
};

try {
sc = SSLContext.getInstance("SSL");
} catch (NoSuchAlgorithmException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
try {
sc.init(null, trustAllCerts, new java.security.SecureRandom());
} catch (KeyManagementException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());

// Create all-trusting host name verifier
HostnameVerifier allHostsValid = new HostnameVerifier() {
public boolean verify(String hostname, SSLSession session) {
return true;
}
};

// Install the all-trusting host verifier
HttpsURLConnection.setDefaultHostnameVerifier(allHostsValid);

// HERE WE START CRAWLING
if(url.startsWith("http")){
Connection con=Jsoup.connect(url).timeout(100000).ignoreHttpErrors(true).followRedirects(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)");
Connection.Response resp = con.execute();
// HERE WE CHECK RESPONSE CODE
if (resp.statusCode() == 200) {
doc = con.get();
// Now lets get the text document
content=doc.html();
} // End of status check
else if(resp.statusCode() == 307){
String sNewUrl = resp.header("Location");
if (sNewUrl != null && sNewUrl.length() > 7)
sUrl = sNewUrl;
resp = Jsoup.connect(sUrl).timeout(100000).ignoreHttpErrors(true).userAgent("Mozilla/5.0(compactible;Googlebot/2.1;+http://www.google.com/bot.html)").execute();
doc =resp.parse();
// Now lets get the text document
content=doc.html();
} // End of status 307 check
} // end of start with check
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Here we return
return content;
}



/************* HERE WE CONSTRUCT INNER CLASS TO HANDLE THREAD *****************/
public static final class Extractor implements Callable<Integer>{
// HERE WE CONSTRUCT CLASS
String[] Urls;
TestingMail Extract;
public Extractor(String[] urls, TestingMail extract){
Urls=urls;
Extract=extract;
}

/*********** HERE WE CALL THE CALLABLE ***********/
@Override
public Integer call() throws Exception {
try {
return Extract.mailExtract(Urls);
} catch (Throwable t) {
t.printStackTrace();
throw new RuntimeException(t);
}
}

// END OF CLASS
}

// END OF CLASS
}

我添加了一些打印语句来监控进程,但我一直得到的都是错误的 java 模式匹配

这是我控制台中的内容

52
0==0
0==1
0==2
0==3
0==4
0==5
0==6
0==7
0==8
Thread: 5 Extracted Mails: 0
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
I found this: false with expression: \\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z0-9.-]+\\b
java.lang.NullPointerException
at system.soft.processor.MailExtractor.mailExtract(MailExtractor.java:202)
at system.soft.processor.MailExtractor.access$0(MailExtractor.java:172)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:239)
at system.soft.processor.MailExtractor$Extractor.call(MailExtractor.java:1)
at java.util.concurrent.FutureTask.run(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)

我似乎无法理解为什么收不到电子邮件,至少上述网站之一在页脚中包含一封支持电子邮件,但我的代码似乎无法收到。我什至将表达方式更改为:\b[a-zA-Z0-9.-]+@[a-zA-Z0-9.-]+\.[a-zA-Z0-9.- ]+\b 但结果相同。我不知道缺少什么或为什么表达式不起作用。

任何帮助将不胜感激

25 4 0
Copyright 2021 - 2024 cfsdn All Rights Reserved 蜀ICP备2022000587号
广告合作:1813099741@qq.com 6ren.com