From 614027dec0872112fb957a8fbc0c289aac3f3fa8 Mon Sep 17 00:00:00 2001 From: wrzzx <863076034@qq.com> Date: Thu, 16 Feb 2017 11:08:00 +0800 Subject: [PATCH] fix... --- .../fetch_networks/.idea/workspace.xml | 218 +--- new_osseanextractor/.idea/workspace.xml | 1160 ++++++++++++++--- .../net/trustie/dao/OpenHubProject_Dao.java | 2 +- .../java/net/trustie/model/openhub_Model.java | 14 +- .../java/net/trustie/one/ExtractThread.java | 9 +- .../trustie/core/GithubRepoPageProcessor.java | 61 + .../test/java/net/trustie/core/URLCheck.java | 19 + 7 files changed, 1135 insertions(+), 348 deletions(-) create mode 100644 new_osseanextractor/src/test/java/net/trustie/core/GithubRepoPageProcessor.java create mode 100644 new_osseanextractor/src/test/java/net/trustie/core/URLCheck.java diff --git a/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml b/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml index ac17b45..0518fc6 100644 --- a/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml +++ b/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml @@ -2,51 +2,11 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + @@ -181,7 +144,7 @@ - + @@ -194,119 +157,9 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -319,7 +172,7 @@ - + - + - - + @@ -770,11 +623,12 @@ + - @@ -787,36 +641,36 @@ - + + - + - + - + - @@ -848,10 +702,30 @@ + + + + + + + + + + + + + + + + + + + + @@ -958,9 +832,11 @@ - - - + + + + + diff --git a/new_osseanextractor/.idea/workspace.xml b/new_osseanextractor/.idea/workspace.xml index 33739d1..cc4173e 100644 --- a/new_osseanextractor/.idea/workspace.xml +++ b/new_osseanextractor/.idea/workspace.xml @@ -26,33 +26,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - + @@ -60,22 +34,15 @@ - - - - - - - - - - gethome gethomepage + readSitesFromConfig + tags + description @@ -90,8 +57,11 @@ @@ -115,6 +85,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -647,6 +585,28 @@ + + + + + + @@ -752,14 +712,16 @@ + + - - + @@ -770,22 +732,22 @@ - + + - - + - + @@ -825,8 +787,310 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -836,8 +1100,245 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -847,7 +1348,8 @@ - + + @@ -862,14 +1364,6 @@ - - - - - - - - @@ -900,34 +1394,372 @@ + + + + + + + + + + + + + + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - + diff --git a/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubProject_Dao.java b/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubProject_Dao.java index 038e775..207a02f 100644 --- a/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubProject_Dao.java +++ b/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubProject_Dao.java @@ -25,7 +25,7 @@ public interface OpenHubProject_Dao { - @Insert("insert into openhub_project_test" + @Insert("insert into openhub_project" +"(`url_md5`,`source`,`name`,`url`,`user_num`,`description`,`tags`,`organization`," + "`license`,`similar_project`,`manager`," + "`language`,`active_degree`," diff --git a/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java b/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java index a6f896e..33f46bc 100644 --- a/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java +++ b/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java @@ -49,12 +49,9 @@ public class openhub_Model implements AfterExtractor, ValidateExtractor { // "| //*[@id='projects_show_page']/div[2]/div[3]/div[2]/div/*/*/a/regex(\".*Homepage\",1)/@href" //jquery multi links // +"| //*[@id='projects_show_page']/div[2]/div[4]/div[2]/div/*/*/a/regex(\".*Homepage\",1)/@href " //) - @ExtractBy(value="//a/regex(\"\",1) " ) - /*+ + @ExtractBy(value="//a/regex(\"\",1) " + " | //a/regex(\"\",1)" + " | //a/regex(\"\",1)") - */ - private List homepages = new ArrayList(); private static String homepage =""; @@ -426,12 +423,9 @@ private List homepages = new ArrayList(); this.rateLevel = this.rateLevel.substring(0, this.rateLevel.indexOf("/")); if (StringHandler.isAtLeastOneBlank(this.name, this.activity - /* - ,this.description - */ - /* + ,this. + description ,getHomepage() - */ /* , * this.licenses */)) { @@ -1280,7 +1274,7 @@ private List homepages = new ArrayList(); public String getValidHomepage(String homePage){ String result = homePage; - //目前的Homepage抽取规则,以“/”开头的如"/p/jQuery"均是多链接的情形 + //目前的Homepage抽取规则,已“/”开头的如"/p/jQuery"均是多链接的情形 if(!homePage.equals("") && homePage.startsWith("/")){ //获取Homepage列表中的Homepage result = new ExtractMutilLink4Openhub().extractLinks(homePage); diff --git a/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java b/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java index e85e8a2..e0d9c27 100644 --- a/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java +++ b/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java @@ -87,14 +87,19 @@ public class ExtractThread implements Runnable{ for( RawPage page : pages){ try{ + long startTime=System.currentTimeMillis(); //获取开始时间 result = extractor.extract(page,pageModel); + long endTime=System.currentTimeMillis(); //获取结束时间 + System.out.println("页面抽取时间: "+(endTime-startTime)+"ms"); + startTime=System.currentTimeMillis(); //获取开始时间 //持久化 并 更新抽取历史 saveResult(site,result); - + endTime=System.currentTimeMillis(); //获取结束时间 + System.out.println("结果保存时间: "+(endTime-startTime)+"ms"); }catch (Exception e){ e.printStackTrace(); - pageErrorOutPut.returnErrorPage(page, e); +// pageErrorOutPut.returnErrorPage(page, e);错误页面 } } updateLastId(site,lastId + pages.size()); diff --git a/new_osseanextractor/src/test/java/net/trustie/core/GithubRepoPageProcessor.java b/new_osseanextractor/src/test/java/net/trustie/core/GithubRepoPageProcessor.java new file mode 100644 index 0000000..6cd7e61 --- /dev/null +++ b/new_osseanextractor/src/test/java/net/trustie/core/GithubRepoPageProcessor.java @@ -0,0 +1,61 @@ +package net.trustie.core; +import net.trustie.utils.ExtractMutilLink4Openhub; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.model.OOSpider; +import us.codecraft.webmagic.model.annotation.ExtractBy; +import us.codecraft.webmagic.processor.PageProcessor; +import us.codecraft.webmagic.selector.Selectable; +import java.util.regex.Matcher; + +import java.util.List; + +public class GithubRepoPageProcessor implements PageProcessor { + + private Site site = Site.me().setRetryTimes(3).setSleepTime(1000); + + + private List homepage; + + @Override + public void process(Page page) { + page.putField("links",page.getHtml().xpath("//div[@class='links center-block']/div[1]/div[2]/div/h5/a/@href")); + homepage = page.getHtml().xpath("//div[@class='links center-block']/div[1]/div[2]/div/h5/a").links().all(); + } + + @Override + public Site getSite() { + return site; + } +// public void setHomepage(String homepage){ +// homepage = homepage; +// } + public List getHomepage(){ + return this.homepage; + } + + public static String extractHomepage(String url){ + String homePage = ""; + GithubRepoPageProcessor githubRepoPageProcessor = new GithubRepoPageProcessor(); + Spider.create(githubRepoPageProcessor).addUrl("https://www.openhub.net/p/zimbra/links").thread(5).run(); + if(!githubRepoPageProcessor.getHomepage().isEmpty()){ + for(String one : githubRepoPageProcessor.getHomepage()){ + homePage += one+";"; + } + homePage = homePage.substring(0,homePage.lastIndexOf(";")); + } + return homePage; + } + + public static void main(String[] args) { + GithubRepoPageProcessor githubRepoPageProcessor = new GithubRepoPageProcessor(); + String url = "/p/jQuery/links"; + Spider.create(githubRepoPageProcessor).addUrl(new String("https://www.openhub.net"+url)).thread(5).run(); + System.out.print(githubRepoPageProcessor.getHomepage().get(0)); +// String hompage = "/p/bac"; +// System.out.println(hompage.startsWith("/")); + String result = new ExtractMutilLink4Openhub().extractLink("/p/jQuery/links"); + System.out.println(result); + } +} diff --git a/new_osseanextractor/src/test/java/net/trustie/core/URLCheck.java b/new_osseanextractor/src/test/java/net/trustie/core/URLCheck.java new file mode 100644 index 0000000..d7a24e6 --- /dev/null +++ b/new_osseanextractor/src/test/java/net/trustie/core/URLCheck.java @@ -0,0 +1,19 @@ +package net.trustie.core; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Created by LouAnt on 2017/2/7 0007 + * Description: + */ +public class URLCheck { + public static void main(String[] args){ + String check ="^([hH][tT]{2}[pP]://|[hH][tT]{2}[pP][sS]://)(([a-zA-Z0-9\\._-]+\\.[a-zA-Z]{2,6})|([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}))(:[0-9]{1,4})*(/[a-zA-Z0-9\\&%_\\./-~-]*)?"; + Pattern p = Pattern.compile(check,Pattern.CASE_INSENSITIVE); + String url = "http://www.leniel.net/2013/04/manage-folders-and-files-in-your-aspnet-mvc-project-with-elfinder-net-file-manager.html"; + Matcher m = p.matcher(url); + boolean res = m.matches(); + System.out.println(res); + } +}