This commit is contained in:
nigel 2017-02-13 09:27:44 +08:00
commit 4e3ff50396
9 changed files with 243 additions and 68 deletions

View File

@ -16,7 +16,6 @@ public class ClearTable {
Logger logger = Logger.getLogger(ClearTable.class);
public void start(){
truncateTable(TableName.eddRelationTableName);
//updateFlag(TableName.synonymsTableName, null);
updateMark(TableName.gatherProjectsTableName,1);
deleteItemInEddPointers(TableName.pointerTableName,TableName.gatherProjectsTableName, TableName.eddRelationTableName);
logger.info("完成去重程序运行前数据表的清空");
@ -117,7 +116,7 @@ public class ClearTable {
Connection conn = null;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/ossean_production", "root", "NUDTpdl@123");
conn = DriverManager.getConnection(TableName.connection, TableName.user, TableName.psw);
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {

View File

@ -81,7 +81,7 @@ public class ClearTableBeforTransfer {
Connection conn = null;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/ossean_production", "root", "NUDTpdl@123");
conn = DriverManager.getConnection(TableName.connection,TableName.user, TableName.psw);
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {

View File

@ -51,6 +51,7 @@ public class MergeProjects {
}
public void start(){
logger.info("start remove projects!");
int count=0;
count = readPointer(pointerTableName,sourceTableName,targetTableName, count);//指针表计量处理的项目数
long start = System.currentTimeMillis();
@ -73,7 +74,7 @@ public class MergeProjects {
//logger.info("Duplicate remove:handling project " + model.getId());
haveRemovedSet.addAll(mergeProjectNew2.handleNewProject(model));
}
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, haveRemovedSet.size());
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, count+haveRemovedSet.size());
if(haveRemovedSet.size()%10000==0)
logger.warn("deal with:"+haveRemovedSet.size()+" projects cost: "+(float)(System.currentTimeMillis() - start)/60000+" minutes");
}
@ -87,7 +88,7 @@ public class MergeProjects {
MergeProjects Main = applicationContext.getBean(MergeProjects.class);
GetSynonyms getSynonyms = (GetSynonyms) applicationContext.getBean("getSynonyms");
getSynonyms.start();
System.out.println("insertSynonyms thread start~");
System.out.println("insertSynonyms thread start!");
Main.start();
}
}

View File

@ -8,7 +8,15 @@ public class TableName {
public static String taggingsTableName = "taggings";
public static String tagsTableName = "tags";
public static String eddRelationTableName = "edd_relations_17";
public static String openSourceProjectsTableName = "open_source_projects";
public static String openSourceProjectsTableName = "open_source_projects_17";
public static String sourceforgeTableName = "sourceforge_project";
public static String openhubTableName = "openhub_project";
public static String oschinaTableName = "oschina_project";
public static String connection = "jdbc:mysql://localhost:3306/ossean_production";
public static String user = "root";
public static String psw = "NUDTpdl@123";
}

View File

@ -10,13 +10,13 @@ import com.ossean.model.SourceForgeProject;
/**
 * Lookups of a single platform-specific project row by its URL.
 * Every query filters on {@code url}, groups by {@code url_md5} and orders by
 * {@code extracted_time} descending. Each lookup exists in two forms: one with the
 * table name written into the SQL, and one that takes the table name as a parameter.
 *
 * NOTE(review): {@code @Select}/{@code @Param} look like MyBatis annotations - confirm.
 * If so, {@code ${table}} is presumably substituted into the SQL text verbatim (unlike
 * the bound {@code #{url}}), so only trusted table names should be passed.
 */
public interface PlatformProjectDao {
// Selects name, rate_num and user_num from the fixed table openhub_project.
@Select("select name,rate_num,user_num from openhub_project where url = #{url} group by url_md5 order by extracted_time desc")
public OpenhubProject getOpenHubPrjByUrl( @Param("url") String url);
// Same columns as above, read from the table named by the "table" parameter.
@Select("select name,rate_num,user_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
public OpenhubProject getOpenHubPrjByUrl( @Param("table") String table,@Param("url") String url);
// Selects name, download_num and favor_num from the fixed table sourceforge_project.
@Select("select name,download_num,favor_num from sourceforge_project where url = #{url} group by url_md5 order by extracted_time desc")
public SourceForgeProject getSourceForgePrjByUrl( @Param("url") String url);//oschina_project
// Same columns as above, read from the table named by the "table" parameter.
@Select("select name,download_num,favor_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
public SourceForgeProject getSourceForgePrjByUrl(@Param("table") String table, @Param("url") String url);//oschina_project
// Selects name, star_num and user_num.
// NOTE(review): this query reads from sourceforge_project although the method returns an
// OschinaProject - looks like a copy/paste slip; confirm which table was intended.
@Select("select name,star_num,user_num from sourceforge_project where url = #{url} group by url_md5 order by extracted_time desc")
public OschinaProject getOschinaPrjByUrl(@Param("url") String url);
// Same columns as above, read from the table named by the "table" parameter.
@Select("select name,star_num,user_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
public OschinaProject getOschinaPrjByUrl(@Param("table") String table,@Param("url") String url);
}

View File

@ -55,8 +55,11 @@ public class MergeProjectNew2 {
private String synonymsTableName = TableName.synonymsTableName;
private String gatherProjectsTableName = TableName.gatherProjectsTableName;
private String eddRelationTableName = TableName.eddRelationTableName;
private String sourceforgeTableName = TableName.sourceforgeTableName;
private String openhubTableName = TableName.openhubTableName;
private String oschinaTableName = TableName.oschinaTableName;
private static Set<Integer> passSet;
//private static Set<Integer> passSet;
private static Set<Integer> relationSet;
public List<String> getSynonyms(GatherProjectsModel model){
@ -67,15 +70,19 @@ public class MergeProjectNew2 {
//处理新项目
/**
 * Processes one gathered project: collects the ids of all projects found to be
 * duplicates of it and records the result through {@code OperateTable}.
 *
 * @param model the project to process
 * @return the ids collected by {@code findDupProject} for this project; an empty set
 *         (never {@code null}) when the project already carries update_mark 2 and is
 *         therefore skipped
 */
public Set<Integer> handleNewProject(GatherProjectsModel model){
    // update_mark == 2 means the project was already handled as part of a duplicate
    // group. Return an empty set instead of null so callers can pass the result
    // straight to Set.addAll without hitting a NullPointerException.
    if(model.getUpdate_mark()==2)
        return new HashSet<Integer>();
    // Start from a fresh set holding every duplicate found for this project.
    relationSet = new HashSet<Integer>();
    relationSet = findDupProject(model,relationSet);
    // Persist the outcome (relation rows and update marks).
    OperateTable(relationSet, model.getId());
    return relationSet;
}
public Set<Integer> findDupProject(GatherProjectsModel model,Set<Integer> relationSet,Set<Integer> passSet){
public Set<Integer> findDupProject(GatherProjectsModel model,Set<Integer> relationSet){
if(gatherDao.selectGPMById(gatherProjectsTableName, model.getId()).getUpdate_mark()==2)
return relationSet;
String prjName = model.getName();
int prjId = model.getId();
String prjHomepage = model.getHomepage();
@ -88,11 +95,17 @@ public class MergeProjectNew2 {
if(prjHomepage == null || prjHomepage.equals(""))
if(model.getSource().equals("apache"))
prjHomepage = model.getUrl();
// if(model.getSource().equals("openhub"))
// if(!isRightUrlByRegex(model.getHomepage())){
// prjHomepage=null;
// }
//homepage相同认为一定是相同的项目
//暂时性对openhub项目进行处理
if(model.getSource().equals("openhub")){
if(!isRightUrlByRegex(model.getHomepage())||includeSubStringNum(model.getHomepage(), "http")>1){
prjHomepage=null;
}
}
//处理homepage相同的项目分别对和当前处理项目同一社区的不同社区的进行处理
List<GatherProjectsModel> sameHomeAndSourceList = new ArrayList<GatherProjectsModel>();
sameHomeAndSourceList.add(model);
Map<String,List<GatherProjectsModel>> sameHomeMap = new HashMap<String, List<GatherProjectsModel>>();
if(prjHomepage != null && !prjHomepage.equals("")){
prjHomepage = deleteHttpPre(prjHomepage);//去掉http&https前缀及/后缀
if(prjHomepage.length() > 0){
@ -104,45 +117,90 @@ public class MergeProjectNew2 {
sameHomepageList = gatherDao.selectGPMBySameHomePage(gatherProjectsTableName, prjHomepage, homepage1, homepage2, homepage3, homepage4, model.getId(),1);
for(GatherProjectsModel m : sameHomepageList){
//logger.info("project " + model.getId() + " and " + m.getId() + " match by homepage");
if(!passSet.contains(m.getId()))
matchedList.add(m);
if(!m.getSource().equals(model.getSource())){
if(sameHomeMap.get(m.getSource())==null){
sameHomeMap.put(m.getSource(),new ArrayList<GatherProjectsModel>());
sameHomeMap.get(m.getSource()).add(m);
}
else
sameHomeMap.get(m.getSource()).add(m);
}
else{
sameHomeAndSourceList.add(m);
}
}
//同一社区homepage相同则选出最有影响力的其余插表更改当前处理的model指向
if(sameHomeAndSourceList.size() != 0){
GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeAndSourceList);
for (GatherProjectsModel tmp : sameHomeAndSourceList){
if(!tmp.equals(uniqueModel)){
//插入关系表
String relationStr = ","+tmp.getId()+",";
dbSource.insertEddRelations(eddRelationTableName, relationStr);
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
}
}
model = uniqueModel;
}
//处理查找的项目有同一source的情况确保一个source最多只有一个项目有相同homepage
for(String uniqueSource:sameHomeMap.keySet()){
GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeMap.get(uniqueSource));
if(!relationSet.contains(uniqueModel.getId()))
matchedList.add(uniqueModel);
for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)){
if(!tmp.equals(uniqueModel)){
//插入关系表
String relationStr = ","+tmp.getId()+",";
dbSource.insertEddRelations(eddRelationTableName, relationStr);
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
}
}
}
}
}
//处理同名项目
List<GatherProjectsModel> sameNamePrjList = gatherDao.selectGPMBySameName(gatherProjectsTableName, prjName, prjId,1);
List<GatherProjectsModel> afterNamePrjList = new ArrayList<GatherProjectsModel>();
for(int i = 0;i<sameNamePrjList.size();i++){
List<Integer> afterNamePrjIdList = new ArrayList<Integer>();
for(int i = 0;i<sameNamePrjList.size();i++){//不重复比较
//logger.info("project " + model.getId() + " and " + sameNamePrjList.get(i).getId() + " match by name");
if(!passSet.contains(sameNamePrjList.get(i).getId()))//未处理的
if(!relationSet.contains(sameNamePrjList.get(i).getId())){//未处理的
afterNamePrjList.add(sameNamePrjList.get(i));
}
}
afterNamePrjList = returnDupModelList(model,afterNamePrjList,true);
for(GatherProjectsModel a:afterNamePrjList){
afterNamePrjIdList.add(a.getId());
}
//处理别名相同的项目
List<String> synonymList = getSynonyms(model);//获取项目的同义词
List<GatherProjectsModel> synonymsPrjList = getRelatedGatherProjects(model,synonymList);
List<GatherProjectsModel> afterSynonymsPrjList = new ArrayList<GatherProjectsModel>();
for(int j = 0;j < synonymsPrjList.size();j++){
for(int j = 0;j < synonymsPrjList.size();j++){//不重复比较
//logger.info("project " + model.getId() + " and " + synonymsPrjList.get(j).getId() + " match by synonyms");
if(!passSet.contains(synonymsPrjList.get(j).getId()) && !afterNamePrjList.contains(model))//未处理的,也不同名否则会重复计算
if(!relationSet.contains(synonymsPrjList.get(j).getId()) && !afterNamePrjIdList.contains(synonymsPrjList.get(j).getId()))//未处理的,也不同名否则会重复计算
afterSynonymsPrjList.add(synonymsPrjList.get(j));
}
afterSynonymsPrjList = returnDupModelList(model,afterSynonymsPrjList,false);
// for(GatherProjectsModel m : afterSynonymsPrjList){
// logger.info("project " + model.getId() + " and " + m.getId() + " match by synonyms/name and verify");
// }
afterSynonymsPrjList = mergeTwoList4Source(afterNamePrjList,afterSynonymsPrjList);
// for(GatherProjectsModel m : afterSynonymsPrjList){
// logger.info("project " + model.getId() + " and " + m.getId() + " match by synonyms/name and verify");
// }
matchedList = mergeTwoList4Source(matchedList,afterSynonymsPrjList);//同一社区的项目相同homepage比名字相同和别名相同优先级更高
relationSet.add(model.getId());
passSet.add(model.getId());
//passSet.add(model.getId());
if(matchedList.size()==0)
return relationSet;
//递归调用有重复关系的项目
for(int i = 0; i < matchedList.size(); i++){
GatherProjectsModel dupModel = matchedList.get(i);
if(!passSet.contains(dupModel.getId())){
findDupProject(dupModel,relationSet,passSet);
if(!relationSet.contains(dupModel.getId())){
findDupProject(dupModel,relationSet);
}
}
return relationSet;
@ -151,18 +209,53 @@ public class MergeProjectNew2 {
//对于重复关系的项目标记update_mark=2之后不再重复操作
@Transactional(propagation=Propagation.REQUIRED)
public void OperateTable(Set<Integer> relationSet,int prjId){
List<String> dupSynonymsList = new ArrayList<String>();
String relationStr = ",";
String finallyRelationStr = ",";
List<GatherProjectsModel> mergeList = new ArrayList<GatherProjectsModel>();
Map<String,List<GatherProjectsModel>> map = new HashMap<String, List<GatherProjectsModel>>();
for (int id:relationSet) {//该集合包括处理项目本身
relationStr = relationStr+id+",";
dupSynonymsList.addAll(dbSource.getSynonymByPrjId(synonymsTableName, id));//获取flag=1的同义词确定的
gatherDao.updateMark(gatherProjectsTableName, 2, id);
mergeList.add(gatherDao.selectGPMById(gatherProjectsTableName, id));
}
dupSynonymsList = removeDupStr(dupSynonymsList);
for(String synonyms:dupSynonymsList){
dbSource.insertSynonymsNew(synonymsTableName,prjId,synonyms,1);
for(GatherProjectsModel model1:mergeList){
if(map.get(model1.getSource())==null){
map.put(model1.getSource(),new ArrayList<GatherProjectsModel>());
map.get(model1.getSource()).add(model1);
}
else
map.get(model1.getSource()).add(model1);
}
for(String uniqueSource:map.keySet()){
GatherProjectsModel uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
for (GatherProjectsModel tmp : map.get(uniqueSource)){
if(!tmp.equals(uniqueModel)){
//插入关系表
String relationStr = ","+tmp.getId()+",";
dbSource.insertEddRelations(eddRelationTableName, relationStr);
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
}
else
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
}
finallyRelationStr = finallyRelationStr + uniqueModel.getId()+",";
}
// List<String> dupSynonymsList = new ArrayList<String>();
// for (int id:relationSet) {//该集合包括处理项目本身
// relationStr = relationStr+id+",";
// dupSynonymsList.addAll(dbSource.getSynonymByPrjId(synonymsTableName, id));//获取flag=1的同义词确定的
// gatherDao.updateMark(gatherProjectsTableName, 2, id);
// }
// dupSynonymsList = removeDupStr(dupSynonymsList);
// for(String synonyms:dupSynonymsList){
// dbSource.insertSynonymsNew(synonymsTableName,prjId,synonyms,1);
// }
try {
dbSource.insertEddRelations(eddRelationTableName, finallyRelationStr);
} catch (Exception e) {
e.printStackTrace();
logger.info(finallyRelationStr);
}
dbSource.insertEddRelations(eddRelationTableName, relationStr);
}
//找到当前项目同义词关联的项目 要求项目不相同 因为同一个项目可能包含多个同义词
@ -212,6 +305,14 @@ public class MergeProjectNew2 {
}
for(String uniqueSource:map.keySet()){
uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
for (GatherProjectsModel tmp : map.get(uniqueSource)){
if(!tmp.equals(uniqueModel)){
//插入关系表
String relationStr = ","+tmp.getId()+",";
dbSource.insertEddRelations(eddRelationTableName, relationStr);
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
}
}
afterReturnList.add(uniqueModel);
}
return afterReturnList;
@ -225,30 +326,48 @@ public class MergeProjectNew2 {
returnModel = list.get(0);
else{
GatherProjectsModel tmp = list.get(0);
for(GatherProjectsModel model1:list){
if(model1.getUrl()==null || model1.getUrl().equals(""))
continue;
if(source.equals("apache")||source.equals("freecode"))
returnModel = list.get(0);
// if(source.equals("oschina")){
// int a = platformProjectDao.getOschinaPrjByUrl(model1.getUrl()).getStar_num();
// int b = platformProjectDao.getOschinaPrjByUrl(tmp.getUrl()).getStar_num();
// if(a > b){
// tmp = model1;
// }
// }
if(source.equals("oschina")){
try {
int a = platformProjectDao.getOschinaPrjByUrl(oschinaTableName,model1.getUrl()).getStar_num();
int b = platformProjectDao.getOschinaPrjByUrl(oschinaTableName,tmp.getUrl()).getStar_num();
if(a > b){
tmp = model1;
}
} catch (Exception e) {
System.out.println(model1.getSource()+" ==== "+model1.getUrl());
System.out.println(tmp.getSource()+" ==== "+tmp.getUrl());
}
}
if(source.equals("openhub")){
int a = platformProjectDao.getOpenHubPrjByUrl(model1.getUrl()).getUser_num();
int b = platformProjectDao.getOpenHubPrjByUrl(tmp.getUrl()).getUser_num();
if(a > b){
tmp = model1;
try {
int a = platformProjectDao.getOpenHubPrjByUrl(openhubTableName,model1.getUrl()).getUser_num();
int b = platformProjectDao.getOpenHubPrjByUrl(openhubTableName,tmp.getUrl()).getUser_num();
if(a > b){
tmp = model1;
}
} catch (Exception e) {
System.out.println(model1.getSource()+" ==== "+model1.getUrl());
System.out.println(tmp.getSource()+" ==== "+tmp.getUrl());
}
}
if(source.equals("sourceforge")){
int a = platformProjectDao.getSourceForgePrjByUrl(model1.getUrl()).getDownload_num();
int b = platformProjectDao.getSourceForgePrjByUrl(tmp.getUrl()).getDownload_num();
if(a > b){
tmp = model1;
try{
int a = platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,model1.getUrl()).getDownload_num();
int b = platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,tmp.getUrl()).getDownload_num();
if(a > b){
tmp = model1;
}
}
catch(Exception e){
System.out.println(model1.getSource()+" ==== "+model1.getUrl());
System.out.println(tmp.getSource()+" ==== "+tmp.getUrl());
}
}
}
returnModel = tmp;
@ -261,6 +380,19 @@ public class MergeProjectNew2 {
double similarity;
int similarTagNum;
boolean isEqual=false;
if(model1.getSource().equals(model2.getSource()))
return false;
//sourceforge描述信息为空时默认为No description...汇总应该直接处理为空
if(model1.getSource().equals("sourceforge")){
if(model1.getDescription()!=null && !model1.getDescription().equals(""))
if(model1.getDescription().equals("No description "+model1.getName()+" Web Site"))
model1.setDescription(null);
}
if(model2.getSource().equals("sourceforge")){
if(model2.getDescription()!=null && !model2.getDescription().equals(""))
if(model2.getDescription().equals("No description "+model2.getName()+" Web Site"))
model2.setDescription(null);
}
//oschina的tag和description都为中文
if(model1.getSource().equals("oschina")||model2.getSource().equals("oschina")){
if(sameName){
@ -294,8 +426,8 @@ public class MergeProjectNew2 {
return true;
else{
similarity = calSimilarity(model1, model2);
//System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity);
if(similarity>=0.4)
//System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity);
if(similarity>=0.5)
return true;
else
return false;
@ -305,6 +437,8 @@ public class MergeProjectNew2 {
}
/**
 * Scores how alike the descriptions of two projects are.
 *
 * @param model1 first project
 * @param model2 second project
 * @return 0 when either description is null or empty; otherwise one minus the
 *         distance reported by an {@code NGram} built with size 4
 *         (presumably a normalized 4-gram distance - confirm against the NGram class)
 */
public double calSimilarity(GatherProjectsModel model1, GatherProjectsModel model2){
    String firstText = model1.getDescription();
    if (firstText == null || firstText.equals("")) {
        return 0;
    }
    String secondText = model2.getDescription();
    if (secondText == null || secondText.equals("")) {
        return 0;
    }
    // Both descriptions are present: turn the distance into a similarity.
    NGram fourGram = new NGram(4);
    return (1-fourGram.distance(firstText, secondText));
}
@ -422,5 +556,31 @@ public class MergeProjectNew2 {
return true;
}
return false;
}
}
/**
 * Counts the non-overlapping occurrences of {@code subStr} in {@code str},
 * scanning from left to right.
 *
 * @param str    text to search; {@code null} yields 0
 * @param subStr text to look for; {@code null} or empty yields 0
 * @return number of non-overlapping occurrences of {@code subStr} in {@code str}
 */
public int includeSubStringNum(String str,String subStr){
    // An empty needle matches at every position without consuming input, which
    // would never terminate; treat it (and null arguments) as "no occurrences".
    if (str == null || subStr == null || subStr.isEmpty()) {
        return 0;
    }
    int count = 0;
    int matchAt = str.indexOf(subStr);
    while (matchAt != -1) {
        count++;
        // Resume right after the current match so matches never overlap.
        matchAt = str.indexOf(subStr, matchAt + subStr.length());
    }
    return count;
}
/**
 * Manual entry point: loads the Spring context from the classpath files matching
 * applicationContext*.xml, obtains the MergeProjectNew2 bean and runs its test() method.
 *
 * @param args unused
 */
public static void main(String [] args){
    ApplicationContext springContext =
            new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
    springContext.getBean(MergeProjectNew2.class).test();
}
/**
 * Ad-hoc check: loads the gathered project with id 346 and runs it through
 * handleNewProject.
 */
public void test(){
    final int sampleProjectId = 346;
    handleNewProject(gatherDao.selectGPMById(gatherProjectsTableName, sampleProjectId));
}
}

View File

@ -160,7 +160,14 @@ public class TransferProjectsUtil2 {
//logger.info("处理edd_relations表数据id" + relation.getId());
List<GatherProjectsModel> gatherProjectList = new ArrayList<GatherProjectsModel>();//最热门项目要放在第一个下标的位置
String gather_projects_ids = relation.getGather_projects_ids();
gather_projects_ids = gather_projects_ids.substring(1, gather_projects_ids.length() - 1);
if(gather_projects_ids.equals(","))
return;
try {
gather_projects_ids = gather_projects_ids.substring(1, gather_projects_ids.length() - 1);
} catch (Exception e1) {
System.out.println(relation.getGather_projects_ids());
e1.printStackTrace();
}
String[] idsArray = gather_projects_ids.split(",");
boolean error_lost = false;
for(int i = 0; i < idsArray.length; i++){

View File

@ -62,8 +62,8 @@ public class URLAvailability {
/**
 * Manual smoke check: prints the results of the URL helper methods for hard-coded
 * sample URLs, including URLs that have a second "http..." address glued onto the first.
 *
 * @param args unused
 */
public static void main(String[] args) {
//URLAvailability u = new URLAvailability();
System.out.println(isConnect2("http://www.oschina.nethttps://market.aliyun.com/software?spm=5176.1846502.0.0.CpQnwc"));
//System.out.println(isRightUrlByRegex("https://www.openhub.net/p/tesseract"));
//System.out.println(isConnect2("http://www.oschina.nethttps://market.aliyun.com/software?spm=5176.1846502.0.0.CpQnwc"));
// Sample with two concatenated URLs, fed to the regex-based check.
System.out.println(isRightUrlByRegex("https://www.openhub.net/p/tesseracthttp://abc.com"));
//System.out.println(isConnect("http://www.freecode.com/projects/pdfsam"));
}

View File

@ -19,9 +19,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://localhost/ossean?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://localhost/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="123456" />
<property name="password" value="1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>
@ -38,9 +38,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://localhost:3306/ossean?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="123456" />
<property name="password" value="1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>