diff --git a/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml b/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml
index 0518fc6..2833dd4 100644
--- a/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml
+++ b/crawler/moreSmarterCrawler/fetch_networks/.idea/workspace.xml
@@ -4,9 +4,13 @@
-
+
+
+
+
+
@@ -33,28 +37,48 @@
-
-
+
+
-
-
+
+
-
-
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
@@ -144,7 +168,6 @@
-
@@ -157,10 +180,241 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -624,11 +878,14 @@
+
+
+
-
+
@@ -641,12 +898,11 @@
-
+
-
@@ -659,7 +915,7 @@
-
+
@@ -671,6 +927,7 @@
+
@@ -694,7 +951,9 @@
-
+
+
+
@@ -702,9 +961,7 @@
-
-
-
+
@@ -712,7 +969,9 @@
-
+
+
+
@@ -720,8 +979,70 @@
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -730,7 +1051,9 @@
-
+
+
+
@@ -738,7 +1061,9 @@
-
+
+
+
@@ -746,7 +1071,9 @@
-
+
+
+
@@ -754,7 +1081,9 @@
-
+
+
+
@@ -762,15 +1091,9 @@
-
-
-
-
-
-
-
-
-
+
+
+
@@ -786,11 +1109,7 @@
-
-
-
-
-
+
@@ -802,18 +1121,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
@@ -826,19 +1133,100 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/new_osseanextractor/.idea/workspace.xml b/new_osseanextractor/.idea/workspace.xml
index cf7385e..665ec28 100644
--- a/new_osseanextractor/.idea/workspace.xml
+++ b/new_osseanextractor/.idea/workspace.xml
@@ -22,70 +22,75 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
gethome
gethomepage
readSitesFromConfig
- valid
tags
description
抽取时间
抽取
+ errorpage
+ after
+ lastID
+ getPages
+ valid
+ getValidHomepage
+ validate
@@ -100,13 +105,21 @@
-
+
+
+
+
-
+
+
+
+
+
+
@@ -219,8 +232,9 @@
-
+
+
@@ -272,6 +286,104 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -303,6 +415,36 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -372,6 +514,10 @@
+
+
+
+
@@ -393,7 +539,6 @@
-
@@ -419,6 +564,8 @@
+
+
@@ -906,48 +1053,59 @@
-
+
+
+
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
-
-
@@ -1001,25 +1159,13 @@
- file://$PROJECT_DIR$/src/main/java/net/trustie/one/OpenhubReExtractor.java
- 126
+ file://$PROJECT_DIR$/src/main/java/net/trustie/model/openhub_Model.java
+ 1306
-
-
-
- file://$PROJECT_DIR$/src/main/java/net/trustie/one/OpenhubReExtractor.java
- 94
-
-
-
-
- file://$PROJECT_DIR$/src/main/java/net/trustie/one/ExtractThread.java
- 100
-
-
+
-
+
@@ -1028,67 +1174,15 @@
-
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
@@ -1096,131 +1190,6 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
@@ -1229,53 +1198,10 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
@@ -1284,80 +1210,392 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
+
+
-
+
-
-
+
+
-
+
-
-
+
+
-
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/new_osseanextractor/osseanextractor.iml b/new_osseanextractor/osseanextractor.iml
index dbab3e2..1bbfc86 100644
--- a/new_osseanextractor/osseanextractor.iml
+++ b/new_osseanextractor/osseanextractor.iml
@@ -11,7 +11,6 @@
-
diff --git a/new_osseanextractor/pom.xml b/new_osseanextractor/pom.xml
index dd0db06..eed203f 100644
--- a/new_osseanextractor/pom.xml
+++ b/new_osseanextractor/pom.xml
@@ -82,6 +82,5 @@
commons-io
2.4
-
diff --git a/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubRetry_Dao.java b/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubRetry_Dao.java
new file mode 100644
index 0000000..98b4aed
--- /dev/null
+++ b/new_osseanextractor/src/main/java/net/trustie/dao/OpenHubRetry_Dao.java
@@ -0,0 +1,46 @@
+package net.trustie.dao;
+
+import net.trustie.model.openhub_retry_Model;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+/**
+ * Created by zaihuilvcha on 2017/2/18.
+ */
+public class OpenHubRetry_Dao {
+ private Connection conn = null;
+ private Statement stmt = null;
+
+ public OpenHubRetry_Dao() {
+ try {
+ Class.forName("com.mysql.jdbc.Driver");
+ String url = "jdbc:mysql://localhost:3306/extract_result?user=root&password=123456";
+ conn = DriverManager.getConnection(url);
+ stmt = conn.createStatement();
+ } catch (ClassNotFoundException e) {
+ e.printStackTrace();
+ } catch (SQLException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+ public int add(openhub_retry_Model oprm) {
+ try {
+ String sql = "INSERT INTO `extract_result`.`openhub_download_fail` (`url`, `html`) VALUES (?, ?);";
+ PreparedStatement ps = conn.prepareStatement(sql);
+ ps.setString(1, oprm.getUrl());
+ ps.setString(2,oprm.getHtml());
+ return ps.executeUpdate();
+ } catch (SQLException e) {
+ e.printStackTrace();
+ }
+ return -1;
+ }
+
+}
diff --git a/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java b/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java
index 0b015f4..7c19351 100644
--- a/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java
+++ b/new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java
@@ -6,7 +6,8 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
-import net.trustie.utils.DateHandler;
+import core.*;
+import net.trustie.dao.OpenHubRetry_Dao;
import net.trustie.utils.ExtractMutilLink4Openhub;
import net.trustie.utils.Seperator;
import net.trustie.utils.StringHandler;
@@ -18,9 +19,8 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
-import core.AfterExtractor;
-import core.Page;
-import core.ValidateExtractor;
+import org.springframework.context.ApplicationContext;
+import org.springframework.context.support.ClassPathXmlApplicationContext;
import us.codecraft.webmagic.model.annotation.ExtractBy;
@ExtractBy("//div[@id='projects_show_page']")
@@ -49,10 +49,11 @@ public class openhub_Model implements AfterExtractor, ValidateExtractor {
// "| //*[@id='projects_show_page']/div[2]/div[3]/div[2]/div/*/*/a/regex(\".*Homepage\",1)/@href" //jquery multi links
// +"| //*[@id='projects_show_page']/div[2]/div[4]/div[2]/div/*/*/a/regex(\".*Homepage\",1)/@href "
//)
- @ExtractBy(value="//a/regex(\"\",1) " +
- " | //a/regex(\"\",1)" +
- " | //a/regex(\"\",1)")
-private List homepages = new ArrayList();
+// @ExtractBy(value="//a/regex(\"\",1) " +
+// " | //a/regex(\"\",1)" +
+// " | //a/regex(\"\",1)")
+ @ExtractBy(value="//a/regex(\"\",1) ")
+ private List homepages = new ArrayList();
private static String homepage ="";
///////////////////////////////////
@@ -401,6 +402,7 @@ private List homepages = new ArrayList();
}
+
@Override
public void validate(Page page) {
//
@@ -423,18 +425,43 @@ private List homepages = new ArrayList();
}
this.rateLevel = this.rateLevel.substring(0, this.rateLevel.indexOf("/"));
+
+
if (StringHandler.isAtLeastOneBlank(this.name, this.activity
,this.description
- ,getHomepage()
+ ,this.getHomepage()
/* ,
* this.licenses
*/)) {
page.setResultSkip(this, true);
+
+ //多homepage抽取时,下载失败,需要将对应page存库.downloadFailFlag为存库失败标志
+ if(downloadFailFlag) {
+
+ //存库开始,先将下载失败标志置为假
+ downloadFailFlag = false;
+
+ System.out.println("homepage页面下载失败,条目准备入库...");
+
+ openhub_retry_Model oprm = new openhub_retry_Model();
+ oprm.setUrl(page.getPageUrl());
+ oprm.setHtml(page.getRawText());
+
+ OpenHubRetry_Dao opDao = new OpenHubRetry_Dao();
+ opDao.add(oprm);
+
+ System.out.println("下载失败条目已存入数据库...");
+
+ }
+
return;
}
+
+
+
}
private void handleQuickRef(Element quickRef) {
@@ -1274,6 +1301,7 @@ private List homepages = new ArrayList();
this.history = history;
}
+ private boolean downloadFailFlag = false;
public String getValidHomepage(String homePage){
String result = homePage;
@@ -1281,6 +1309,16 @@ private List homepages = new ArrayList();
if(!homePage.equals("") && homePage.startsWith("/")){
//获取Homepage列表中的Homepage
result = new ExtractMutilLink4Openhub().extractLinks(homePage);
+ /**
+ * result=""时,需要对相应page加入存库处理。
+ * 这里必然是多homepage情况。所以此处若正常则result不为空,若为空则说明下载失败了,必须存库
+ */
+ if(result.equals("")) {
+
+ //存库标志位置为true
+ downloadFailFlag = true;
+
+ }
}
return result;
}
diff --git a/new_osseanextractor/src/main/java/net/trustie/model/openhub_retry_Model.java b/new_osseanextractor/src/main/java/net/trustie/model/openhub_retry_Model.java
new file mode 100644
index 0000000..892e742
--- /dev/null
+++ b/new_osseanextractor/src/main/java/net/trustie/model/openhub_retry_Model.java
@@ -0,0 +1,36 @@
+package net.trustie.model;
+
+/**
+ * Created by zaihuilvcha on 2017/2/18.
+ */
+public class openhub_retry_Model {
+ String url;
+ String html;
+
+
+
+ public void setUrl(String url) {
+ this.url = url;
+ }
+
+ public void setHtml(String html) {
+ this.html = html;
+ }
+
+ public String getUrl() {
+
+ return url;
+ }
+
+ public String getHtml() {
+ return html;
+ }
+
+ @Override
+ public String toString() {
+ return "openhub_retry_Model{" +
+ "url='" + url + '\'' +
+ ", html='" + html + '\'' +
+ '}';
+ }
+}
diff --git a/new_osseanextractor/src/main/java/net/trustie/model/sourceforge_Model.java b/new_osseanextractor/src/main/java/net/trustie/model/sourceforge_Model.java
index e740c11..0adb501 100644
--- a/new_osseanextractor/src/main/java/net/trustie/model/sourceforge_Model.java
+++ b/new_osseanextractor/src/main/java/net/trustie/model/sourceforge_Model.java
@@ -28,9 +28,9 @@ import java.util.regex.Pattern;
public class sourceforge_Model implements AfterExtractor, ValidateExtractor{
///////////////
-
-@ExtractBy("//a[@id='homepage']/@href " +
- " | //*[@class='homepage-link']/a/@href")
+//@ExtractBy("//a[@id='homepage']/@href " +
+// " | //*[@class='homepage-link']/a/@href")
+@ExtractBy("//*[@id='homepage']/@href ")
private String homepage;
//////////////////
diff --git a/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java b/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java
index 028d9b5..8468af9 100644
--- a/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java
+++ b/new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java
@@ -84,13 +84,16 @@ public class ExtractThread implements Runnable{
Extractor extractor = new Extractor();
RawPage result = null;
while(pages.size() > 0){
-
+ /**
+ * 注意这里的page就是detail表中的一条记录。
+ */
for( RawPage page : pages){
try{
long startTime=System.currentTimeMillis(); //获取开始时间
result = extractor.extract(page,pageModel);
long endTime=System.currentTimeMillis(); //获取结束时间
-// System.out.println("页面抽取时间: "+(endTime-startTime)+"ms");
+// System.out.println("页面抽取时间: "+(endTime-startTime)+"ms");
+
startTime=System.currentTimeMillis(); //获取开始时间
//持久化 并 更新抽取历史
saveResult(site,result);
@@ -98,10 +101,11 @@ public class ExtractThread implements Runnable{
// System.out.println("结果保存时间: "+(endTime-startTime)+"ms");
}catch (Exception e){
- e.printStackTrace();
-// pageErrorOutPut.returnErrorPage(page, e);错误页面
+// e.printStackTrace();
+// pageErrorOutPut.returnErrorPage(page, e); //错误页面
}
}
+ //结果入库之后再更新lastId.注意,处理一批才会存一次
updateLastId(site,lastId + pages.size());
lastId = getLastId(site);
pages = getPages(site,lastId);
diff --git a/new_osseanextractor/src/main/java/net/trustie/one/OpenhubReExtractor.java b/new_osseanextractor/src/main/java/net/trustie/one/OpenhubReExtractor.java
index a88c412..ed9fe8e 100644
--- a/new_osseanextractor/src/main/java/net/trustie/one/OpenhubReExtractor.java
+++ b/new_osseanextractor/src/main/java/net/trustie/one/OpenhubReExtractor.java
@@ -95,7 +95,7 @@ class ReExtractThread implements Runnable{
saveResult(site, rawPage);
}catch (Exception e){
e.printStackTrace();
- // pageErrorOutPut.returnErrorPage(page, e);错误页面
+// pageErrorOutPut.returnErrorPage(page, e); //错误页面
}
}
//更新抽取游标
diff --git a/new_osseanextractor/src/main/java/net/trustie/utils/ExtractMutilLink4Openhub.java b/new_osseanextractor/src/main/java/net/trustie/utils/ExtractMutilLink4Openhub.java
index a7e0d51..99d9b02 100644
--- a/new_osseanextractor/src/main/java/net/trustie/utils/ExtractMutilLink4Openhub.java
+++ b/new_osseanextractor/src/main/java/net/trustie/utils/ExtractMutilLink4Openhub.java
@@ -12,7 +12,7 @@ import java.util.List;
* Description:抽取openhub中多链接的项目
*/
public class ExtractMutilLink4Openhub implements PageProcessor{
- private Site site = Site.me().setRetryTimes(10).setSleepTime(500);
+ private Site site = Site.me().setRetryTimes(5).setSleepTime(500);
private static List homepages;
private static Page _page;
@@ -40,27 +40,28 @@ public class ExtractMutilLink4Openhub implements PageProcessor{
}
}catch(Exception e) {
+
}
return ExtractMutilLink4Openhub.homepages;
}
/**
- * 获取多个link,用“;”间隔
+ * 获取多个link,用“;”间隔.此方法为二次抽取的入口方法
* @param url
* @return
*/
public String extractLinks(String url){
String homepage = "";
- doExtract(url);
+ doExtract(url); //(如果成功了的话)得到了homepages
try {
- List tempPages = getHomepages(url);
+ List tempPages = getHomepages(url); //以防初次抽取的homepages没抽成功,若没成功的话通过getHomepages继续抽
if (tempPages != null && !tempPages.isEmpty()) {
for (String link : tempPages) {
if(!homepage.contains(link)){//避免加入重复的链接
homepage += link + ";";
}
}
- homepage = homepage.substring(0, homepage.lastIndexOf(";"));
+ homepage = homepage.substring(0, homepage.lastIndexOf(";")); //将结果最后的“;”去除
}
}catch (Exception e){
e.printStackTrace();
@@ -89,7 +90,7 @@ public class ExtractMutilLink4Openhub implements PageProcessor{
* 抽取Homepage核心方法
* @param url 抽取目标url链接
*/
- public static void doExtract(String url){
+ public static void doExtract(String url) {
Spider.create(extractMutilLink4Openhub).addUrl(new String("https://www.openhub.net"+url)).thread(1).run();
}
diff --git a/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java b/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java
index 851ded2..bb60a54 100644
--- a/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java
+++ b/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java
@@ -47,10 +47,13 @@ public interface OpenSourceProjectDao {
@Param("tags") String tags);
// 批量获取项目
- @Select("select id,source,url,filtration from open_source_projects where filtration=0 limit #{batchSize}")
- public List getBatchPrjs(
+ @Select("select id,source,url,filtration from open_source_projects where filtration=0 and id >= #{startId} limit #{batchSize}")
+ public List getBatchPrjs(@Param("startId") int startId,
@Param("batchSize") int batchSize);
+ @Select("select min(id) from open_source_projects where filtration=#{filtration}")
+ public int getMinId(@Param("filtration") int filtration);
+
// 删除项目
@Update("delete from open_source_projects where id=#{id}")
public void deleteProject(@Param("id") int id);
diff --git a/project_manager/src/main/java/com/ossean/projectmanager/model/OpenhubProject.java b/project_manager/src/main/java/com/ossean/projectmanager/model/OpenhubProject.java
index ef01778..f4129a1 100644
--- a/project_manager/src/main/java/com/ossean/projectmanager/model/OpenhubProject.java
+++ b/project_manager/src/main/java/com/ossean/projectmanager/model/OpenhubProject.java
@@ -3,7 +3,7 @@ package com.ossean.projectmanager.model;
public class OpenhubProject {
private String description;
private String name;
- private String codeRepository;
+ private String code_repository;
public String getDescription() {
return description;
}
@@ -16,10 +16,11 @@ public class OpenhubProject {
public void setName(String name) {
this.name = name;
}
- public String getCodeRepository() {
- return codeRepository;
+ public String getCode_repository() {
+ return code_repository;
}
- public void setCodeRepository(String codeRepository) {
- this.codeRepository = codeRepository;
+ public void setCode_repository(String code_repository) {
+ this.code_repository = code_repository;
}
+
}
diff --git a/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java b/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java
index ea5c37c..7f30f50 100644
--- a/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java
+++ b/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java
@@ -37,9 +37,10 @@ public class ProjectsFilter {
*/
public void filtratePrjs() {
logger.info("Reading projects......");
+ int startId = lastProjectDao.getMinId(0);
while (true) {
List prjsList = lastProjectDao
- .getBatchPrjs(batchsize);
+ .getBatchPrjs(startId,batchsize);
if(prjsList.size()==0){
logger.info("Filter done......sleeping......");
try {
@@ -81,16 +82,16 @@ public class ProjectsFilter {
&& !"".equals(openhubProject.getName())
&& openhubProject.getDescription() != null
&& !"".equals(openhubProject.getDescription())
- && openhubProject.getCodeRepository() != null
- && !openhubProject.getCodeRepository().contains(
- "add a code location")) { // openhub的筛选条件为name、description不为空,且该项目有版本库
+ && openhubProject.getCode_repository() != null
+ && !openhubProject.getCode_repository().contains(
+ "Add a code location")) { // openhub的筛选条件为name、description不为空,且该项目有版本库
lastProjectDao.updateFiltratedPrj(project.getId(),
1); // 筛选标识从0变为1,表示该项目经过筛选新增的
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果,确保无之前的匹配结果
} else {
- lastProjectDao.deleteProject(project.getId());
+ //lastProjectDao.deleteProject(project.getId());
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果
@@ -111,7 +112,7 @@ public class ProjectsFilter {
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果,确保无之前的匹配结果
} else {
- lastProjectDao.deleteProject(project.getId());
+ //lastProjectDao.deleteProject(project.getId());
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果
@@ -127,6 +128,7 @@ public class ProjectsFilter {
logger.info("Unknown source... source = " + source);
}
}
+ startId = prjsList.get(prjsList.size()-1).getId()+1;
}
diff --git a/project_manager/src/main/resource/applicationContext-myBatis.xml b/project_manager/src/main/resource/applicationContext-myBatis.xml
index 992a3e4..d75abb8 100644
--- a/project_manager/src/main/resource/applicationContext-myBatis.xml
+++ b/project_manager/src/main/resource/applicationContext-myBatis.xml
@@ -39,9 +39,9 @@
destroy-method="close">
+ value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
-
+
diff --git a/project_match/.project b/project_match/.project
index 55bc6cd..3652b46 100644
--- a/project_match/.project
+++ b/project_match/.project
@@ -15,8 +15,14 @@
+
+ org.springframework.ide.eclipse.core.springbuilder
+
+
+
+ org.springframework.ide.eclipse.core.springnature
org.eclipse.jdt.core.javanature
org.eclipse.m2e.core.maven2Nature
diff --git a/project_match/src/main/java/com/ossean/ClearTableBeforTransfer.java b/project_match/src/main/java/com/ossean/ClearTableBeforTransfer.java
index 39a13c2..0300c32 100644
--- a/project_match/src/main/java/com/ossean/ClearTableBeforTransfer.java
+++ b/project_match/src/main/java/com/ossean/ClearTableBeforTransfer.java
@@ -17,7 +17,7 @@ public class ClearTableBeforTransfer {
Logger logger = Logger.getLogger(ClearTableBeforTransfer.class);
public void start(){
truncateTable(TableName.openSourceProjectsTableName);
- deleteItemInEddPointers(TableName.eddRelationTableName, TableName.openSourceProjectsTableName);
+ deleteItemInEddPointers(TableName.pointerTableName,TableName.eddRelationTableName, TableName.openSourceProjectsTableName);
deleteTaggingsForProject(TableName.taggingsTableName);
logger.info("完成转移程序运行前数据表的清空和taggings表OpenSourceProject的删除");
}
@@ -58,8 +58,8 @@ public class ClearTableBeforTransfer {
}
}
//删除edd_pointers表中transfer对应的数据
- public void deleteItemInEddPointers(String sourceTableName, String targetTableName){
- String sql = "delete from edd_pointers where SourceTableName=? and TargetTableName=?";
+ public void deleteItemInEddPointers(String pointerName,String sourceTableName, String targetTableName){
+ String sql = "delete from "+pointerName+" where SourceTableName=? and TargetTableName=?";
Connection conn = getConnection();
if(conn == null){
logger.info("没有获取到Connection");
diff --git a/project_match/src/main/java/com/ossean/GetSynonyms.java b/project_match/src/main/java/com/ossean/GetSynonyms.java
index 9950976..8079a60 100644
--- a/project_match/src/main/java/com/ossean/GetSynonyms.java
+++ b/project_match/src/main/java/com/ossean/GetSynonyms.java
@@ -38,6 +38,12 @@ public class GetSynonyms{
private static String synonymsTableName = TableName.synonymsTableName;
private static String pointerTableName = TableName.pointerTableName;
private static int batchSize = 1000;
+
+
+ //为了很好的将别名提取的程序结构化,特设以下三个全局变量在从项目名称中提取别名时使用
+ String projectNameWithoutBracket = "";//项目名去除括号
+ String projectNameWithoutComName = "";//项目名称去除括号和公司、基金名字
+ String bracket = "";//括号中内容
public void start() {
long start_all = System.currentTimeMillis();
@@ -99,207 +105,517 @@ public class GetSynonyms{
}
public Map getSynonyms(GatherProjectsModel model) {
-
+ if(model.getSource().equals("github"))
+ {
+ return getSynonymsFromGithubPrj(model);//处理github项目
+ }
+ else
+ {
+ return getSynonymsNotGitPrj(model);//处理非github的其他项目
+ }
+ }
+
+ @SuppressWarnings("finally")
+ public Map getSynonymsFromGithubPrj(GatherProjectsModel model)
+ {
- Map SynonymMap = new HashMap();// 记录每个别名及其准确度,表时Synonym可靠性很大
- List synonymsList = new ArrayList();// 最终返回的同义词列表
- List synonymsList_notsure = new ArrayList();
model.setName(model.getName().trim());
- if(model.getDescription()!=null && !model.getDescription().equals(""))
+ if(model.getDescription() != null)
+ {
model.setDescription(model.getDescription().trim());
+ }
+ else
+ {
+ model.setDescription("");
+ }
+
+ //记录每个别名及其准确度,表时Synonym可靠性很大
+ Map SynonymMap = new HashMap();
+
+ //最终返回的可以确定正确的同义词列表
+ List synonymsList = new ArrayList();
+
+ String description = model.getDescription();
+ String prjName = model.getName();
+
+ try
+ {
+ //github项目名称中有斜杠,提取斜杠后内容做为别名
+ String subName = getSubName(prjName);
+ if(subName != null)
+ {
+ SynonymMap.put(subName, 1);
+ }
+
+
+ //将subName中短横线、下划线用空格代替以及将短横线、下划线删除后获取的字符串当做别名
+ String strReplace = ReplaceSpecial(subName);
+ if(!strReplace.equals(subName))
+ {
+ synonymsList.add(strReplace);
+ }
+
+ //删除特殊符号如短横线、下划线,如果其包含在description中则做为别名
+ String strDelete = DeleteSpecial(subName);
+ if(!strDelete.equals(subName))
+ {
+ if(description.toLowerCase().contains(strDelete.toLowerCase()))
+ {
+ synonymsList.add(strDelete);
+ }
+ }
+
+
+ if(description == null)
+ {
+ //如果描述信息为空,别名就只包括已经提取出的和从名字中获取的别名
+ if(synonymsList != null)
+ for(String sure:synonymsList)
+ SynonymMap.put(sure, 1);
+
+ return SynonymMap;
+ }
+
+ //描述信息非空时,尝试从description中提取别名
+ Map synonymsFromDesc = getSynonymsFromDescOfGitprj(description,model.getName());
+
+ //将别名连同其确定性一起放入SynonymMap中
+ for(String tmpSyn:synonymsList)
+ SynonymMap.put(tmpSyn, 1);
+
+ //将从描述信息中获取的别名及其确定性放入SynonymMap
+ if(synonymsFromDesc != null)
+ for(String key:synonymsFromDesc.keySet())
+ SynonymMap.put(key, synonymsFromDesc.get(key));
+
+ return SynonymMap;
+ }catch(Exception e)
+ {
+ e.printStackTrace();
+ logger.error("Error while extracting synonyms from project:" + model.getId());
+ System.exit(0);
+ }finally
+ {
+ return SynonymMap;
+
+ }
+
+ }
+
+ private Map getSynonymsNotGitPrj(GatherProjectsModel model) {
+ model.setName(model.getName().trim());
+ if(model.getDescription() != null)
+ model.setDescription(model.getDescription().trim());
+
+ List synonymsList = new ArrayList();//能够确定正确的别名
+ List synonymsList_notsure = new ArrayList();//不能百分百确定的别名
+
String description = model.getDescription();
String prjName = model.getName();
List result = null;
- if (description == null) {
+ if(description == null)
+ {
+ //如果description为null,则别名无需再做改动直接返回null
return null;
}
-
- List linkNameList = new ArrayList();// 用于存储所有提取出来的关联项目名
- List keywords = FileReader.read("./files/language.txt");
-
- // 对项目名进行处理,去除公司名或基金组织的信息&&去除括号(没有公司没有括号则包含自己)
- String projectNameWithoutBracket = StringHandler_ProjectName
- .removeBracket(prjName);
- String projectNameWithoutComName = StringHandler_ProjectName
- .getProjectWithoutComName(projectNameWithoutBracket);
- // String projectNameWithoutComName =
- // StringHandler_ProjectName.getProjectWithoutComName(prjName);
- // projectNameWithoutComName =
- // StringHandler_ProjectName.removeBracket(projectNameWithoutComName);
- if (!"".equals(projectNameWithoutComName)) {
+
+ List linkNameList = new ArrayList();//用于存储所有提取出来的可能是别名的词
+
+ List keywords = FileReader.read("./files/language.txt");//用于过滤类似于"Java","C++"这样的词
+ projectNameWithoutBracket = StringHandler_ProjectName.removeBracket(prjName);
+ projectNameWithoutComName = StringHandler_ProjectName.getProjectWithoutComName(projectNameWithoutBracket);
+ //提取项目名括号中的信息,不是项目类型
+ bracket = StringHandler_ProjectName.getBracket(prjName);
+
+ //去除括号及括号内容后即可做为别名
+ if(!"".equals(projectNameWithoutComName))
+ {
synonymsList.add(projectNameWithoutComName);
- // logger.info("gets synonyms:" + projectNameWithoutComName +
- // " Without ComName and remove racket");
+ logger.info("gets synonyms:" + projectNameWithoutComName + " Without ComName and remove racket");
}
-
- // 提取项目名括号中的信息,不是项目类型
- String bracket = StringHandler_ProjectName.getBracket(prjName);
- if (bracket != null && !"".equals(bracket)) {
- bracket = RegexHandler.extractEngDecChiAndDot(bracket);// 去除括号中信息的特殊符号
- if (!RegexHandler.onlySpecialChar(bracket)) {
- if (!keywords.contains(bracket.toLowerCase())) {// 括号中的信息不是语言类关键字词
- if (StringHandler.isFullOrShort(bracket,
- projectNameWithoutBracket)) {
- synonymsList.add(bracket);
- } else {
- if (StringHandler.specialIsFullOrShort(bracket,
- projectNameWithoutBracket))
+
+
+ try
+ {
+ //TODO 将对项目名称的处理过程独立为一个函数
+ getSynonymsFromName(synonymsList,linkNameList,keywords);
+
+ if(description != null)
+ getSynonymsFromDescOfNotGitprj(model,description,prjName,synonymsList,linkNameList,keywords);
+
+ //result包含去掉公司信息、括号内的、be动词之前的
+ result = removeDupStr(linkNameList);
+
+ //TODO 将从linkName中提取全/缩写独立出来
+ getSynonymsByFullOrShort(result,synonymsList);
+
+ //TODO 将从linkName中依据相似度提取别名的程序独立出来
+ getSynonymsBySimilarDegree(model.getName(),result,synonymsList_notsure,keywords,"notGitPrj");
+ }catch(Exception e)
+ {
+ logger.error("Error while extracting synonym from prj:" + model.getId() + ":" +e);
+ }
+
+ return afterExtract(synonymsList, synonymsList_notsure);
+ }
+
+ public void getSynonymsFromName(List synonymsList,List linkNameList,List keywords)
+ {
+ if(bracket != null && projectNameWithoutBracket != null && projectNameWithoutComName != null)
+ {
+ // Map SynonymMap = new HashMap();
+ if(!"".equals(bracket))
+ {
+ bracket = RegexHandler.extractEngDecChiAndDot(bracket);//去除括号中信息的特殊符号
+ if(!RegexHandler.onlySpecialChar(bracket))
+ {
+ if(!keywords.contains(bracket.toLowerCase()))//括号中的信息不是语言类关键字词
+ {
+ //如果括号中内容个项目名存在全/缩写关系
+ if(StringHandler.isFullOrShort(bracket, projectNameWithoutBracket))
+ {
synonymsList.add(bracket);
+ //SynonymMap.put(bracket, 1);
+ }
else
- linkNameList.add(bracket);
+ {
+ if(StringHandler.specialIsFullOrShort(bracket, projectNameWithoutBracket))
+ synonymsList.add(bracket);
+ else linkNameList.add(bracket);
+ }
+
}
-
}
}
}
-
- if (description != null) {
-
- List beWords = FileReader.read("./files/beVerb.txt");// 读取be动词表
- description = StringHandler.getFirstSentence(description);
-
- for (String beWord : beWords) {
-
- String linkName = StringHandler.findLinkName(description,
- beWord);// 按系动词表优先级进行匹配
- if (linkName == null) {// 表示没有匹配到当前的系动词 进行下一个匹配
- continue;
- }
- if (linkName.toLowerCase().equals(prjName.toLowerCase()))
- continue;
- // 特征短语去噪
- List linkNameResult = EDDHandler
- .getMainAndViceDescriptionByBracket(linkName);// 提取主副描述信息(括号中)
- linkNameResult = EDDHandler
- .getMainAndViceDescriptionByKnowAs(linkNameResult);// 提取主副描述信息(known
- // as+连词)
- linkNameResult = EDDHandler.removeComma(linkNameResult);// 去除定语描述信息(逗号定语)
- linkNameResult = EDDHandler
- .removeDemonstrativeWords(linkNameResult);// 去除定语描述信息(定冠词The)
- linkNameResult = EDDHandler
- .removeArrtibutiveClause(linkNameResult);// 去除从句描述信息
- linkNameResult = EDDHandler.removePronoun(linkNameResult);// 去除指示代词
- linkNameResult = EDDHandler.removeProject(linkNameResult);// 去除项目常用词
- linkNameResult = EDDHandler.removePPCombine(linkNameResult);// 去除指示代词和项目常用词两者的笛卡尔积
- linkNameResult = RegexHandler
- .extractEngDecChiAndDot(linkNameResult);// 提取项目中
- linkNameResult = RegexHandler.removeOnlySpecial(linkNameResult);// 删除只有特殊字符的同义词
- for (String name : linkNameResult) {
- // logger.info("be 动词之前的有: "+name);
- // be动词前不包括项目类型,且和项目名不相同
- if (!keywords.contains(name.toLowerCase().trim())
- && !name.toLowerCase().trim()
- .equals(prjName.toLowerCase().trim())
- && !name.toLowerCase()
- .trim()
- .equals(projectNameWithoutComName
- .toLowerCase().trim())) {
- // 这里的prjName是完整的name
- if (prjName.toLowerCase().indexOf(name.toLowerCase()) != -1) {
- String withoutShortDash = RemoveShortDash(name);
- if (bracket == null || bracket.equals(""))
- continue;
- if (name.toLowerCase()
- .equals(bracket.toLowerCase())
- || name.toLowerCase().equals(
- withoutShortDash))
- synonymsList.add(name);
- else {
- int nameLength1 = withoutShortDash.split(" ").length;
- int nameLength2 = projectNameWithoutBracket
- .split(" ").length;
- int nameLength = name.trim().split(" ").length;
- double ratio1 = (nameLength1 / nameLength) * 1.0;
- double ratio2 = (nameLength2 / nameLength) * 1.0;
- if (nameLength1 > 1 && ratio1 > 0.6)
- synonymsList.add(name);
- else {
- if (nameLength2 > 1 && ratio2 > 0.6)
- synonymsList.add(name);
- else
- linkNameList.add(name);
- }
- }
-
- // synonymsList.add(name);//问题出在这个地方,如果从discription中提取出的词在名字中有出现就直接加入,这
- // 会导致比如Ajax被识别为Ajax Implement的别名,而实际上并不是这样的。
- // logger.info("gets synonyms:" + name +
- // " from description before be verb");
- } else
- linkNameList.add(name);
- }
-
- }
- }
- }
- // result包含去掉公司信息、括号内的、be动词之前的
- result = removeDupStr(linkNameList);
- // logger.info("result去掉公司信息、括号内的、be动词之前的;");
-// for (int i = 0; i < result.size(); i++)
-// System.out.println("result: " + result.get(i));
-
- // 去除重复的
-
- String fullName = StringHandler.getFullName(
- projectNameWithoutComName.toLowerCase(), result);
- if (!fullName.equals("")) {
- synonymsList.add(fullName.toLowerCase());// 表示项目名确实是缩写 并提取到了项目全名
- // logger.info("项目的全称为: "+fullName);
- }
-
- String shortName = StringHandler.getShortName(
- projectNameWithoutComName.toLowerCase(), result);
- if (!shortName.equals("")) {
- synonymsList.add(shortName.toLowerCase());// 表示项目名确实是全称 并提取到了项目缩写
- // logger.info("项目的简称为: "+shortName);
- }
-
- // 同义词要和项目名称有一定的相似度
- for (int i = 0; i < result.size(); i++) {
+ }
+
+ /**
+  * Collects "unsure" synonyms whose wording is similar to, but not identical with, the project name.
+  *
+  * For every candidate in {@code result} the words that occur (case-insensitively, as substrings)
+  * in {@code Name} are joined with blanks into an extract. The extract is added to
+  * {@code synonymsList_notsure} when it is non-empty, is not contained in {@code keywords}, differs
+  * from the project name, and its SimilarDegree against the project name lies in [0.6, 1.0).
+  *
+  * @param Name                 project name the candidates are compared with
+  * @param result               candidate phrases
+  * @param synonymsList_notsure output list that receives the accepted extracts
+  * @param keywords             words that must never be reported as a synonym
+  * @param prjSource            "github" compares against {@code Name} and its sub-name; any other
+  *                             value compares against the field {@code projectNameWithoutComName}
+  */
+ public void getSynonymsBySimilarDegree(String Name,List<String> result,List<String> synonymsList_notsure,List<String> keywords,String prjSource)
+ {
+ // a synonym must be reasonably similar to the project name
+ for(int i = 0; i < result.size(); i++){
String extract = "";
String synonym = result.get(i);
- String[] words = synonym.split(" ");// 按照空格进行分词
- for (String word : words) {
- if (model.getName().toLowerCase().indexOf(word.toLowerCase()) >= 0) {// 查看每个词在项目名中是否出现
+ String[] words = synonym.split(" ");// tokenize on blanks
+ for(String word:words){
+ if(Name.toLowerCase().indexOf(word.toLowerCase()) >= 0){// keep only the words that occur in the project name
extract += word + " ";
}
}
- if (extract.length() != 0)
+ if(extract.length() !=0)
extract = extract.substring(0, extract.length() - 1);
- // logger.info("extract : "+extract);
- if (!"".equals(extract) && !keywords.contains(extract)) {
- // 提取的别名不和项目名重复
- if (!extract.toLowerCase().equals(prjName.toLowerCase().trim())
- && !extract
- .toLowerCase()
- .trim()
- .equals(projectNameWithoutComName.toLowerCase()
- .trim())) {
- double similarDegree = SimilarDegree(extract,
- projectNameWithoutComName);
- // System.out.println("字符串间的相似度为: "+similarDegree);
-
- if (similarDegree >= 0.6 && similarDegree < 1.0) {
- // logger.info("from extract after similar: "+extract);
+ //logger.info("extract : "+extract);
+ if(!"".equals(extract) && !keywords.contains(extract))
+ {
+ // The two sources are mutually exclusive. The previous guard (!"notGitPrj") let GitHub
+ // projects fall into the non-GitHub comparison as well and never enabled "notGitPrj".
+ boolean isGithub = prjSource.equals("github");
+ boolean AbleToCalcSimilar = false;
+ if(isGithub)
+ {
+ String strReplace = ReplaceSpecial(getSubName(Name));
+ if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(strReplace.toLowerCase().trim()))
+ AbleToCalcSimilar = true;
+ }
+ else
+ {
+ // projectNameWithoutComName is the value dereferenced below, so it is the one that must be non-null
+ if(projectNameWithoutBracket != null && projectNameWithoutComName != null)
+ if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim()))
+ AbleToCalcSimilar = true;
+ }
+ // only extracts that do not simply repeat the project name are scored
+ if(AbleToCalcSimilar)
+ {
+ double similarDegree = isGithub ? SimilarDegree(extract, Name) : SimilarDegree(extract,projectNameWithoutComName);
+ if(similarDegree >= 0.6 && similarDegree <1.0)
+ {
+ logger.info("from extract after similar: "+extract);
synonymsList_notsure.add(extract);
}
}
}
}
- removeDupStr(synonymsList);// 去除同名列表中重复的
- removeDupStr(synonymsList_notsure);
- for (String s : synonymsList) {
- // logger.info("finally synonyms:" + s + " by description");
+ }
+
+ /**
+  * Adds the full-name and short-name forms of the project name to {@code synonymsList}.
+  *
+  * Both lookups are delegated to StringHandler and use the lower-cased field
+  * {@code projectNameWithoutComName} together with the candidates in {@code result};
+  * an empty string returned by StringHandler means "nothing found" and is not added.
+  */
+ public void getSynonymsByFullOrShort(List result,ListsynonymsList)
+ {
+ String fullName = StringHandler.getFullName(projectNameWithoutComName.toLowerCase(), result);
+ if(!fullName.equals("")){
+ synonymsList.add(fullName.toLowerCase());// the project name really is an abbreviation and its full name was extracted
}
+
+ String shortName = StringHandler.getShortName(projectNameWithoutComName.toLowerCase(), result);
+ if(!shortName.equals("")){
+ synonymsList.add(shortName.toLowerCase());// the project name really is a full name and its abbreviation was extracted
+ }
+ }
+
+ /**
+  * Extracts synonym candidates from the first sentence of a non-Git project's description.
+  *
+  * For every be-verb read from ./files/beVerb.txt the phrase returned by
+  * StringHandler.findLinkName is denoised with getLinkNameResult and then sorted into
+  * {@code synonymsList} / {@code linkNameList} by handleLinkNameResult. Phrases equal to
+  * {@code prjName} (ignoring case) are skipped.
+  */
+ public void getSynonymsFromDescOfNotGitprj(GatherProjectsModel model,String description,String prjName,List synonymsList,List linkNameList,List keywords)
+ {
- for (String sure : synonymsList)
- SynonymMap.put(sure, 1);
- for (String Notsure : synonymsList_notsure)
- SynonymMap.put(Notsure, 0);
- return SynonymMap;
+
+ List beWords = FileReader.read("./files/beVerb.txt");// read the be-verb list
+ //TODO move the tokenization of description into a step of its own
+ description = StringHandler.getFirstSentence(description);
+ for(String beWord:beWords){
+
+ String linkName = StringHandler.findLinkName(description, beWord);// match following the priority order of the linking-verb list
+ if(linkName == null){// this linking verb did not match; try the next one
+ continue;
+ }
+ if(linkName.toLowerCase().equals(prjName.toLowerCase()))
+ continue;
+ // denoise the feature phrase
+ List linkNameResult =getLinkNameResult(linkName);
+
+ handleLinkNameResult(model,linkNameResult,synonymsList,linkNameList,keywords);
+ }
+
+ }
+ public Map getSynonymsFromDescOfGitprj(String description,String prjName)
+ {
+ List keywords = FileReader.read("./files/language.txt");
+ List beWords = FileReader.read("./files/beVerb.txt");
+ //程序中只使用description的第一句进行处理。
+ String descriptionOfFirstSentence = StringHandler.getFirstSentence(description);
+ //获取斜杠后名字
+ String subName = getSubName(prjName);
+
+ //将subName中的特殊符号用空格代替
+ String strReplace = ReplaceSpecial(subName);
+ //将subName中的特殊符号直接删除
+ String strDelete = DeleteSpecial(subName);
+ List linkNameList = new ArrayList();
+ List synonymsList = new ArrayList();//确定正确的别名
+ List synonymsList_not_sure = new ArrayList();//不确定是否正确但是满足提取规则的别名
+
+// int matchBewords = 0;//指示description中是否有be动词,为0则没有匹配到任何be动词,大于1则说明有匹配到be动词
+ List linkNameResult = new ArrayList();//存放description分词后的处理结果
+ int length = beWords.size();
+ for(int k=0;kresult = removeDupStr(linkNameList);
+ getSynonymsBySimilarDegree(subName, result, synonymsList_not_sure, keywords,"github");
+ return afterExtract(synonymsList,synonymsList_not_sure);
+ }
+
+ /**
+  * Runs the phrase found in front of a be-verb through the noise-removal pipeline.
+  *
+  * The steps are applied in a fixed order; each one receives the list produced by the
+  * previous one.
+  *
+  * @param linkName raw phrase taken from the description
+  * @return the candidate phrases left after all EDDHandler / RegexHandler steps
+  */
+ public List getLinkNameResult(String linkName)
+ {
+     // split into main and secondary descriptions: first by brackets, then by "known as" + conjunction
+     List cleaned = EDDHandler.getMainAndViceDescriptionByKnowAs(
+             EDDHandler.getMainAndViceDescriptionByBracket(linkName));
+     // drop attributive descriptions introduced by a comma
+     cleaned = EDDHandler.removeComma(cleaned);
+     // drop attributive descriptions introduced by the definite article "The"
+     cleaned = EDDHandler.removeDemonstrativeWords(cleaned);
+     // drop clause descriptions
+     cleaned = EDDHandler.removeArrtibutiveClause(cleaned);
+     // drop demonstrative pronouns
+     cleaned = EDDHandler.removePronoun(cleaned);
+     // drop words commonly used for projects
+     cleaned = EDDHandler.removeProject(cleaned);
+     // drop the Cartesian-product combinations of pronouns and common project words
+     cleaned = EDDHandler.removePPCombine(cleaned);
+     // NOTE(review): presumably keeps English / digit / Chinese / dot characters only - confirm in RegexHandler
+     cleaned = RegexHandler.extractEngDecChiAndDot(cleaned);
+     // finally drop candidates that consist of special characters only
+     return RegexHandler.removeOnlySpecial(cleaned);
+ }
+
+ /**
+  * Sorts the phrases found in front of a be-verb into confirmed synonyms and plain link names.
+  *
+  * A phrase is ignored when it is a keyword or equals the project name (with or without the
+  * company part). A phrase that does not occur inside the full project name goes to
+  * {@code linkNameList}. A phrase that does occur is a confirmed synonym when it equals the
+  * bracket text, equals its own dash-less form, or passes one of the word-count ratio checks;
+  * otherwise it also goes to {@code linkNameList}.
+  *
+  * @param model          project whose full name the phrases are compared with
+  * @param linkNameResult denoised candidate phrases
+  * @param synonymsList   output list of confirmed synonyms
+  * @param linkNameList   output list of remaining candidates
+  * @param keywords       project-type words that are never synonyms
+  */
+ public void handleLinkNameResult(GatherProjectsModel model,List<String> linkNameResult,List<String> synonymsList,List<String> linkNameList,List<String> keywords)
+ {
+     for(String name:linkNameResult){
+         logger.info("be 动词之前的有: "+name);
+         String fullPrjName = model.getName().toLowerCase();
+         String lowerName = name.toLowerCase();
+         String normalized = lowerName.trim();
+         // the phrase must not be a project type and must differ from the project name
+         if(keywords.contains(normalized) || normalized.equals(fullPrjName.trim())
+                 || normalized.equals(projectNameWithoutComName.toLowerCase().trim()))
+             continue;
+         // fullPrjName is the complete project name
+         if(fullPrjName.indexOf(lowerName) == -1){
+             linkNameList.add(name);
+             continue;
+         }
+         String withoutShortDash = RemoveShortDash(name);
+         // bracket may be unset; treat that as "no bracket match" instead of throwing
+         boolean sameAsBracket = bracket != null && lowerName.equals(bracket.toLowerCase());
+         if(sameAsBracket || lowerName.equals(withoutShortDash)){
+             synonymsList.add(name);
+             continue;
+         }
+         int nameLength1 = withoutShortDash.split(" ").length;
+         int nameLength2 = projectNameWithoutBracket.split(" ").length;
+         int nameLength = name.trim().split(" ").length;
+         // divide as double: int / int truncates before the 0.6 threshold is applied
+         double ratio1 = (double) nameLength1 / nameLength;
+         double ratio2 = (double) nameLength2 / nameLength;
+         if((nameLength1 > 1 && ratio1 > 0.6) || (nameLength2 > 1 && ratio2 > 0.6))
+             synonymsList.add(name);
+         else
+             linkNameList.add(name);
+     }
+ }
+
+ /**
+  * Returns the part of {@code prjName} that follows its first '/'.
+  *
+  * @param prjName project name, e.g. "owner/repository"; must not be null
+  * @return the text after the first slash, or {@code prjName} itself when it contains no slash
+  */
+ private String getSubName(String prjName) {
+     int slash = prjName.indexOf('/');
+     if(slash == -1)
+         return prjName;
+     // start one past the slash so the separator itself is not part of the sub-name
+     return prjName.substring(slash + 1);
+ }
+
+ /**
+  * Removes every underscore, hyphen, dot and slash from {@code subName}.
+  *
+  * @param subName text to clean; must not be null
+  * @return {@code subName} without any '_', '-', '.' or '/' character
+  */
+ public static String DeleteSpecial(String subName) {
+     StringBuilder kept = new StringBuilder(subName.length());
+     for(char ch : subName.toCharArray()) {
+         boolean special = ch == '_' || ch == '-' || ch == '.' || ch == '/';
+         if(!special)
+             kept.append(ch);
+     }
+     return kept.toString();
+ }
+
+
+ public static String ReplaceSpecial(String subName) {
+ // TODO Auto-generated method stub
+ StringBuffer str=new StringBuffer(subName);
+ for(int i=0;iafterExtract(List synonymsList,ListsynonymsList_not_sure)
+ {
+ Map SynonymMap = new HashMap();
+ if(synonymsList != null)
+ {
+ synonymsList = removeDupStr(synonymsList);
+ if(synonymsList != null)
+ for(String sure:synonymsList)
+ {
+ SynonymMap.put(sure, 1);
+ }
+ }
+
+ if(synonymsList_not_sure != null)
+ {
+ synonymsList_not_sure = removeDupStr(synonymsList_not_sure);
+ for(String Notsure:synonymsList_not_sure)
+ {
+ SynonymMap.put(Notsure, 0);
+ }
+ }
+
+ return SynonymMap;
+ }
+
+ public String getSynonymByFullOrShortOfGitPrj(String subName,String subName2,String descWithoutBracketHandled)
+ {
+ if(StringHandler.isFullOrShort(ReplaceSpecial(descWithoutBracketHandled).toLowerCase(), subName.toLowerCase()) ||
+ StringHandler.specialIsFullOrShortForGitPrj(ReplaceSpecial(descWithoutBracketHandled).toLowerCase(), subName.toLowerCase()))
+ {
+ return descWithoutBracketHandled;
+ //System.out.println("ReplaceSpecial提取到别名:" + descWithoutBracketHandled);
+ }
+// if(StringHandler.isFullOrShort(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()) ||
+// StringHandler.specialIsFullOrShortForGitPrj(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()))
+// {
+// return descWithoutBracketHandled;
+// //System.out.println("DeleteSpecial提取到别名:" + descWithoutBracketHandled);
+// }
+ return null;
+ }
public static String RemoveShortDash(String name) {
String withoutShortDash = "";
if (name.indexOf('-') > -1)
diff --git a/project_match/src/main/java/com/ossean/TestDate.java b/project_match/src/main/java/com/ossean/TestDate.java
index 9f048a5..f3e4274 100644
--- a/project_match/src/main/java/com/ossean/TestDate.java
+++ b/project_match/src/main/java/com/ossean/TestDate.java
@@ -4,6 +4,7 @@ import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
+import java.util.Set;
import javax.annotation.Resource;
@@ -46,13 +47,15 @@ public class TestDate {
}
public void start_test(){
- String homepage = "http://github.com/apache/abdera;http://incubator.apache.org/abdera/";
- String homepage1 = "http://incubator.apache.org/abdera/";
- String arr[] = homepage1.split(";");
- System.out.println(arr.length);
- for(String a : arr){
- System.out.println(a);
- }
+// String homepage = "http://github.com/apache/abdera;http://incubator.apache.org/abdera/";
+// String homepage1 = "http://incubator.apache.org/abdera/";
+// String arr[] = homepage1.split(";");
+// System.out.println(arr.length);
+// for(String a : arr){
+// System.out.println(a);
+// }
+ Set set = dbDest.selectOpenSourceProjectsIds("gather_projects_test");
+ System.out.println(set.size());
diff --git a/project_match/src/main/java/com/ossean/TransferProjects.java b/project_match/src/main/java/com/ossean/TransferProjects.java
index 2746a07..d0603e4 100644
--- a/project_match/src/main/java/com/ossean/TransferProjects.java
+++ b/project_match/src/main/java/com/ossean/TransferProjects.java
@@ -67,7 +67,7 @@ public class TransferProjects {
}
public void start(){
- boolean isUpdate = false;
+
haveTransferSet = new HashSet();
haveTransferSet = dbDest.selectOpenSourceProjectsIds(targetTableName);
int delCount = 0;
@@ -85,6 +85,7 @@ public class TransferProjects {
}
}
for(EddRelations relation:eddRelationList){
+ boolean isUpdate = false;
String gather_projects_ids = relation.getGather_projects_ids();
try {
gather_projects_ids = gather_projects_ids.substring(1, gather_projects_ids.length() - 1);
diff --git a/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java b/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java
index a74c5dd..11a84f5 100644
--- a/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java
+++ b/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java
@@ -208,7 +208,7 @@ public class MergeProjectNew2 {
matchedIdList.add(uniqueModel.getId());
}
for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)) {
- if (!tmp.equals(uniqueModel) && tmp.getUpdate_mark() != 2) {
+ if (tmp.getId() != uniqueModel.getId() && tmp.getUpdate_mark() != 2) {
// 插入关系表
String relationStr = "," + tmp.getId() + ",";
dbSource.insertEddRelations(eddRelationTableName,
@@ -319,7 +319,7 @@ public class MergeProjectNew2 {
GatherProjectsModel uniqueModel = getUniqueSourceModel(map
.get(uniqueSource));
for (GatherProjectsModel tmp : map.get(uniqueSource)) {
- if (!tmp.equals(uniqueModel)) {
+ if (tmp.getId()!= uniqueModel.getId()) {
// 插入关系表
String tmpStr = "," + tmp.getId() + ",";
if (tmp.getUpdate_mark() != 2) {
@@ -439,7 +439,7 @@ public class MergeProjectNew2 {
uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
for (GatherProjectsModel tmp : map.get(uniqueSource)) {
- if (!tmp.equals(uniqueModel) && tmp.getUpdate_mark() != 2) {
+ if (tmp.getId() != uniqueModel.getId() && tmp.getUpdate_mark() != 2) {
// 插入关系表
String relationStr = "," + tmp.getId() + ",";
dbSource.insertEddRelations(eddRelationTableName,
@@ -774,7 +774,7 @@ public class MergeProjectNew2 {
public void test() {
GatherProjectsModel model = gatherDao.selectGPMById(
- gatherProjectsTableName, 1);
+ gatherProjectsTableName, 543);
handleNewProject(model, false);
}
}
diff --git a/project_match/src/main/java/com/ossean/util/StringHandler.java b/project_match/src/main/java/com/ossean/util/StringHandler.java
index c3c11ee..52d2d42 100644
--- a/project_match/src/main/java/com/ossean/util/StringHandler.java
+++ b/project_match/src/main/java/com/ossean/util/StringHandler.java
@@ -50,7 +50,6 @@ public class StringHandler {
}
}
resultStrArr = tmp.toArray(new String[0]);
-//System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]);
return resultStrArr;
}
@@ -289,13 +288,18 @@ public class StringHandler {
}
public static boolean getFullName(String shortName, String synonym,int a){
-
//查看当前的同义词是不是含不含有全称
String[] words = synonym.toLowerCase().split(" ");
String shortenWord = "";
for(int i = 0; i < words.length; i++){
String word = words[i];
- String tmp = word.substring(0, 1);//取出第一个英文字符
+ String tmp = "";
+ if(word.length() == 1)
+ tmp = word;
+ else
+ if(word.length() > 0)
+ tmp = word.substring(0, 1);//取出第一个英文字符
+
shortenWord += tmp;
}
@@ -358,26 +362,22 @@ public class StringHandler {
public static boolean specialIsFullOrShort(String shortName, String fullName)
{
- //System.out.println("进入了特殊别名提取函数");
//首先获取两个字符串中单词的
String[] words_fullName= fullName.toLowerCase().split(" ");
String[] words_shortName = shortName.toLowerCase().split(" ");
if(words_fullName.length==1 && words_shortName.length==1)
{
- //System.out.println("两者长度都大于1");
return false;
}
if(words_shortName.length == 1 && words_fullName.length > 1 )
{
- //System.out.println("fullName的长度大于1");
if(specialIsFullOrShorDetail(shortName,fullName))
return true;
}
else
{
- //System.out.println("shortName长度大于1");
if(specialIsFullOrShorDetail(fullName,shortName))
return true;
}
@@ -397,10 +397,15 @@ public class StringHandler {
//要求全写中各个单词的首字母必须在缩写中出现而且是顺序出现或最多有一个不出现在缩写中。
for(int i = 0; i < words_fullName.length; i++){
String word = words_fullName[i].toLowerCase();
- String tmp = word.substring(0, 1);//取出第一个英文字符
+ String tmp = "";//取出第一个英文字符
+ if(word.length() == 1)
+ tmp = word;
+ else
+ if(word.length() > 0)
+ tmp = word.substring(0, 1);//取出第一个英文字符
if(shortName.toLowerCase().contains(tmp))
{
- //System.out.println("进入了详细字符串比对函数");
+
// tmpFlag = shortName.indexOf(tmp);
// if(tmpFlag > positionFlag)
// {
@@ -412,11 +417,10 @@ public class StringHandler {
/**
如果满足全写中各个单词的首字母都在缩写中顺序出现就接着
- 判断缩写中所有字幕是否都在写中出现
+ 判断缩写中所有字幕是否都在全写中出现
*/
if(matchCount > words_fullName.length-2)
{
- //System.out.println("匹配个数与全写中单词个数相等");
if(isAllContained(shortName,fullName))
return true;
}
@@ -540,4 +544,50 @@ public class StringHandler {
return shortName;//表示描述信息中出现了缩写形式
return "";
}
+
+ /**
+  * Variant of specialIsFullOrShort with two extra rejections applied first.
+  *
+  * The pair is rejected when more than two tokens of {@code fullName} occur in
+  * {@code shortName}, or when either string contains the other. Otherwise both strings are
+  * split on blanks and the decision is delegated to specialIsFullOrShorDetail, with the
+  * single-word string passed as the abbreviation.
+  *
+  * @param shortName first string of the pair; must not be null
+  * @param fullName  second string of the pair; must not be null
+  * @return {@code true} only when specialIsFullOrShorDetail accepts the pair
+  */
+ public static boolean specialIsFullOrShortForGitPrj(String shortName, String fullName)
+ {
+     // split fullName on the separator it actually uses: '_' first, then '-', otherwise a blank
+     String separator;
+     if(fullName.indexOf("_") > -1)
+         separator = "_";
+     else if(fullName.indexOf("-") > -1)
+         separator = "-";
+     else
+         separator = " ";
+
+     int containedNum = 0;// number of fullName tokens that occur in shortName
+     for(String tmp:fullName.split(separator))
+         if(!tmp.isEmpty() && shortName.contains(tmp))// an empty token is "contained" in everything
+             containedNum++;
+     if(containedNum > 2)
+         return false;
+
+     if(fullName.contains(shortName) || shortName.contains(fullName))
+         return false;
+
+     // word lists of both strings
+     String[] words_fullName= fullName.toLowerCase().split(" ");
+     String[] words_shortName = shortName.toLowerCase().split(" ");
+
+     if(words_fullName.length==1 && words_shortName.length==1)
+         return false;
+
+     if(words_shortName.length == 1 && words_fullName.length > 1 )
+         return specialIsFullOrShorDetail(shortName,fullName);
+     return specialIsFullOrShorDetail(fullName,shortName);
+ }
}
diff --git a/project_match/src/main/java/com/ossean/util/TransferProjectsUtil2.java b/project_match/src/main/java/com/ossean/util/TransferProjectsUtil2.java
index 4b04d91..727e5bf 100644
--- a/project_match/src/main/java/com/ossean/util/TransferProjectsUtil2.java
+++ b/project_match/src/main/java/com/ossean/util/TransferProjectsUtil2.java
@@ -16,6 +16,7 @@ import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.dao.DuplicateKeyException;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
@@ -202,13 +203,24 @@ public class TransferProjectsUtil2 {
//logger.info("!!!!!!!!"+gather_projects_ids);
//logger.info(osp.getId());
OpenSourceProject formerOsp = getFormerOsp(gather_projects_ids);
+ try{
if(formerOsp.getId() != osp.getId() && formerOsp.getFiltration()==2)
osp.setFiltration(1);
+ }catch(Exception e){
+ System.out.println(gather_projects_ids);
+ }
+
dbDest.deleteOpenSourceProjectsItem(openSourceProjectTableName,formerOsp.getId());
dbSource.deleteTaggingsByTaggableId(taggingTableName, formerOsp.getId()); //由于没有级联删除 因此需要手动删除
dbDest.deleteMatchResult(formerOsp.getId(), getTargetTable(formerOsp.getId()));
}
+ //try{
dbDest.insertOsp(openSourceProjectTableName, osp);//将数据插入open_source_projects表中
+ //}catch (DuplicateKeyException e){
+ // System.out.println(isUpdate);
+ // System.out.println(e);
+ //}
+
int osp_id = osp.getId();
String tags = osp.getTags();//tag分离
if(tags != null){//表示该项目没有标签
@@ -422,8 +434,14 @@ public class TransferProjectsUtil2 {
* 如果全部相同 则不进行后续操作
*/
public OpenSourceProject getFormerOsp(String gather_projects_ids){
- String[] idArray = gather_projects_ids.split(",");
OpenSourceProject osp_former = null;
+ if(gather_projects_ids.indexOf(",")<0){
+ osp_former = dbDest.selectOpenSourceProjectsItem(openSourceProjectTableName, Integer.parseInt(gather_projects_ids));
+ return osp_former;
+ }
+
+ String[] idArray = gather_projects_ids.split(",");
+
for(String id_str:idArray){
//查找对应osp记录
int id = Integer.parseInt(id_str);
diff --git a/project_match/src/main/resource/applicationContext-myBatis.xml b/project_match/src/main/resource/applicationContext-myBatis.xml
index 1221373..89374ce 100644
--- a/project_match/src/main/resource/applicationContext-myBatis.xml
+++ b/project_match/src/main/resource/applicationContext-myBatis.xml
@@ -19,9 +19,9 @@
destroy-method="close">
+ value="jdbc:mysql://localhost/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
-
+
@@ -38,9 +38,9 @@
destroy-method="close">
+ value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
-
+