From 6b4c829653a8a72e4bb444dfa8cc3f7f400cd8c7 Mon Sep 17 00:00:00 2001 From: waterbeach <1207296860@qq.com> Date: Fri, 17 Feb 2017 16:27:22 +0800 Subject: [PATCH] add handle multihomepage --- .../src/main/java/com/ossean/TableName.java | 2 +- .../src/main/java/com/ossean/TestDate.java | 15 +- .../com/ossean/util/MergeProjectNew2.java | 913 ++++++++++-------- 3 files changed, 515 insertions(+), 415 deletions(-) diff --git a/project_match/src/main/java/com/ossean/TableName.java b/project_match/src/main/java/com/ossean/TableName.java index 9f3bfc4..8a683a6 100644 --- a/project_match/src/main/java/com/ossean/TableName.java +++ b/project_match/src/main/java/com/ossean/TableName.java @@ -8,7 +8,7 @@ public class TableName { public static String taggingsTableName = "taggings"; public static String tagsTableName = "tags"; public static String eddRelationTableName = "edd_relations"; - public static String openSourceProjectsTableName = "open_source_projects_17"; + public static String openSourceProjectsTableName = "open_source_projects"; public static String sourceforgeTableName = "sourceforge_project"; public static String openhubTableName = "openhub_project"; public static String oschinaTableName = "oschina_project"; diff --git a/project_match/src/main/java/com/ossean/TestDate.java b/project_match/src/main/java/com/ossean/TestDate.java index af51055..9f048a5 100644 --- a/project_match/src/main/java/com/ossean/TestDate.java +++ b/project_match/src/main/java/com/ossean/TestDate.java @@ -43,11 +43,24 @@ public class TestDate { // System.out.println(a.toString()); // System.out.println(b); // System.out.println(a.after(b)); + } + + public void start_test(){ + String homepage = "http://github.com/apache/abdera;http://incubator.apache.org/abdera/"; + String homepage1 = "http://incubator.apache.org/abdera/"; + String arr[] = homepage1.split(";"); + System.out.println(arr.length); + for(String a : arr){ + System.out.println(a); + } + + + } public static void main(String[] args) { ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml"); TestDate Main = applicationContext.getBean(TestDate.class); - Main.start(); + Main.start_test(); // StringBuilder a = new StringBuilder(","); // String b = StringHandler.removeFirstEndComma(a.toString()); // System.out.println(b); diff --git a/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java b/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java index 9f58b48..a74c5dd 100644 --- a/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java +++ b/project_match/src/main/java/com/ossean/util/MergeProjectNew2.java @@ -59,635 +59,722 @@ public class MergeProjectNew2 { private String sourceforgeTableName = TableName.sourceforgeTableName; private String openhubTableName = TableName.openhubTableName; private String oschinaTableName = TableName.oschinaTableName; - //private static GatherProjectsModel gitModel = null;//gitModel表示重复关系的github项目中最有影响力的Model;先不加入关系中 - - + // private static GatherProjectsModel gitModel = + // null;//gitModel表示重复关系的github项目中最有影响力的Model;先不加入关系中 + private static Set removeSet; private static Set relationSet; private static int handleCount; - - public List getSynonyms(GatherProjectsModel model){ - List synonymsList = new ArrayList();//最终返回的同义词列表 - synonymsList = dbSource.getSynonymByPrjId(synonymsTableName, model.getId()); - return synonymsList; + + public List getSynonyms(GatherProjectsModel model) { + List synonymsList = new ArrayList();// 最终返回的同义词列表 + synonymsList = dbSource.getSynonymByPrjId(synonymsTableName, + model.getId()); + return synonymsList; } - //处理新项目 - - public int handleNewProject(GatherProjectsModel model,boolean isIncrese){ - - if(isIncrese){ - logger.info("model "+model.getId()+" is increase data"); + + // 处理新项目 + + public int handleNewProject(GatherProjectsModel model, boolean isIncrese) { + + if (isIncrese) { + logger.info("model " + model.getId() + " is increase data"); } - //已缓存的列表中的数据,若已处理 则不重复处理 - if(gatherDao.selectGPMById(gatherProjectsTableName, model.getId()).getUpdate_mark()==2) + // 已缓存的列表中的数据,若已处理 则不重复处理 + if (gatherDao.selectGPMById(gatherProjectsTableName, model.getId()) + .getUpdate_mark() == 2) return 0; - relationSet = new HashSet();//有重复关系的项目集合 - removeSet = new HashSet();//已经处理的项目,包括单独插入的和有重复关系的 - relationSet = findDupProject(model,relationSet,isIncrese); -// if(gitModel!=null) -// relationSet.add(gitModel.getId()); - OperateTable(relationSet,isIncrese);//表操作 + relationSet = new HashSet();// 有重复关系的项目集合 + removeSet = new HashSet();// 已经处理的项目,包括单独插入的和有重复关系的 + relationSet = findDupProject(model, relationSet, isIncrese); + // if(gitModel!=null) + // relationSet.add(gitModel.getId()); + OperateTable(relationSet, isIncrese);// 表操作 return removeSet.size(); } - - //isIncrese=true对应增量数据,应扫全表判断 - public Set findDupProject(GatherProjectsModel model,Set relationSet,boolean isIncrese){ - //递归时检查 - if(gatherDao.selectGPMById(gatherProjectsTableName, model.getId()).getUpdate_mark()==2) + + // isIncrese=true对应增量数据,应扫全表判断 + public Set findDupProject(GatherProjectsModel model, + Set relationSet, boolean isIncrese) { + // 递归时检查 + if (gatherDao.selectGPMById(gatherProjectsTableName, model.getId()) + .getUpdate_mark() == 2) return relationSet; String prjName = model.getName(); int prjId = model.getId(); String prjHomepage = model.getHomepage(); - - //项目重复匹配列表 + // apache社区没有homepage,用url代替 + if (prjHomepage == null || prjHomepage.equals("")) + if (model.getSource().equals("apache")) + prjHomepage = model.getUrl(); + // 暂时性对openhub项目进行处理 + // if(model.getSource().equals("openhub")){ + // if(model.getHomepage()!=null && !model.getHomepage().equals("")) + // if(!isRightUrlByRegex(model.getHomepage())||includeSubStringNum(model.getHomepage(), + // "http")>1){ + // prjHomepage=null; + // } + // } + + // 项目重复匹配列表 List matchedList = new ArrayList(); List sameHomepageList = new ArrayList(); - - //apache社区没有homepage,用url代替 - if(prjHomepage == null || prjHomepage.equals("")) - if(model.getSource().equals("apache")) - prjHomepage = model.getUrl(); - //暂时性对openhub项目进行处理 - if(model.getSource().equals("openhub")){ - if(model.getHomepage()!=null && !model.getHomepage().equals("")) - if(!isRightUrlByRegex(model.getHomepage())||includeSubStringNum(model.getHomepage(), "http")>1){ - prjHomepage=null; - } - } - - //处理homepage相同的项目,分别对和当前处理项目同一社区的、不同社区的进行处理 + // 处理homepage相同的项目,分别对和当前处理项目同一社区的、不同社区的进行处理 List sameHomeAndSourceList = new ArrayList(); List matchedIdList = new ArrayList(); sameHomeAndSourceList.add(model); - Map> sameHomeMap = new HashMap>(); - if(prjHomepage != null && !prjHomepage.equals("")){ - prjHomepage = deleteHttpPre(prjHomepage);//去掉“http&https”前缀及“/”后缀 - if(prjHomepage.length() > 0){ - //为homepage添加http或https头 - String homepage1 = "http://" + prjHomepage; - String homepage2 = "https://" + prjHomepage; - String homepage3 = "http://" + prjHomepage + "/"; - String homepage4 = "https://" + prjHomepage + "/"; - if(isIncrese) - sameHomepageList = gatherDao.selectGPMBySameHomePageForIncrease(gatherProjectsTableName, prjHomepage, homepage1, - homepage2, homepage3, homepage4, model.getId()); - else - sameHomepageList = gatherDao.selectGPMBySameHomePage(gatherProjectsTableName, prjHomepage, homepage1, - homepage2, homepage3, homepage4, model.getId(),1); - for(GatherProjectsModel m : sameHomepageList){ - //logger.info("project " + model.getId() + " and " + m.getId() + " match by homepage"); - if(!m.getSource().equals(model.getSource())){ - if(sameHomeMap.get(m.getSource())==null){ - sameHomeMap.put(m.getSource(),new ArrayList()); - sameHomeMap.get(m.getSource()).add(m); - } - else - sameHomeMap.get(m.getSource()).add(m); - } - else{ - sameHomeAndSourceList.add(m); - } + + Map> sameHomeMap = new HashMap>(); + if (prjHomepage != null && !prjHomepage.equals("")) { + String[] homepageArr = prjHomepage.split(";"); + for (String homepage : homepageArr) { + homepage = deleteHttpPre(homepage);// 去掉“http&https”前缀及“/”后缀 + if (homepage.length() > 0) { + // 为homepage添加http或https头 + String homepage1 = "http://" + homepage; + String homepage2 = "https://" + homepage; + String homepage3 = "http://" + homepage + "/"; + String homepage4 = "https://" + homepage + "/"; + if (isIncrese) + sameHomepageList.addAll(gatherDao + .selectGPMBySameHomePageForIncrease( + gatherProjectsTableName, homepage, + homepage1, homepage2, homepage3, + homepage4, model.getId())); + else + sameHomepageList.addAll(gatherDao + .selectGPMBySameHomePage( + gatherProjectsTableName, homepage, + homepage1, homepage2, homepage3, + homepage4, model.getId(), 1)); } - //同一社区,homepage相同,则选出最有影响力的,其余插表。 - if(sameHomeAndSourceList.size() != 0){ - GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeAndSourceList); - for (GatherProjectsModel tmp : sameHomeAndSourceList){ - if(tmp.getId()!=uniqueModel.getId() && tmp.getId()!=model.getId() && tmp.getUpdate_mark()!=2){ - //插入关系表 - String relationStr = ","+tmp.getId()+","; - dbSource.insertEddRelations(eddRelationTableName, relationStr); - gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId()); - //handleCount++; - removeSet.add(tmp.getId()); - } - } - if(model.getId()!=uniqueModel.getId()){ - String relationStr = ","+model.getId()+","; - relationSet.add(model.getId()); - return relationSet; - }else{ -// if(uniqueModel.getSource().equals("github")){ -// if(gitModel!=null) -// if(compareGithubProject(uniqueModel,gitModel)) -// gitModel = uniqueModel; -// } - } - - - } - //处理查找的项目有同一source的情况,确保一个source最多只有一个项目有相同homepage - for(String uniqueSource:sameHomeMap.keySet()){ - GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeMap.get(uniqueSource)); -// if(uniqueModel.getSource().equals("github")){ -// if(gitModel!=null) -// if(compareGithubProject(uniqueModel,gitModel)) -// gitModel = uniqueModel; -// } - if(!relationSet.contains(uniqueModel.getId()) && !uniqueModel.getSource().equals("github")){ - matchedList.add(uniqueModel); - matchedIdList.add(uniqueModel.getId()); - } - for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)){ - if(!tmp.equals(uniqueModel) && tmp.getUpdate_mark()!=2){ - //插入关系表 - String relationStr = ","+tmp.getId()+","; - dbSource.insertEddRelations(eddRelationTableName, relationStr); - gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId()); - //handleCount++; - removeSet.add(tmp.getId()); - } - } - } - } + for (GatherProjectsModel m : sameHomepageList) { + // logger.info("project " + model.getId() + " and " + + // m.getId() + " match by homepage"); + if (!m.getSource().equals(model.getSource())) { + if (sameHomeMap.get(m.getSource()) == null) { + sameHomeMap.put(m.getSource(), + new ArrayList()); + sameHomeMap.get(m.getSource()).add(m); + } else + sameHomeMap.get(m.getSource()).add(m); + } else { + sameHomeAndSourceList.add(m); + } + } + // 同一社区,homepage相同,则选出最有影响力的,其余插表。 + if (sameHomeAndSourceList.size() != 0) { + GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeAndSourceList); + for (GatherProjectsModel tmp : sameHomeAndSourceList) { + if (tmp.getId() != uniqueModel.getId() + && tmp.getId() != model.getId() + && tmp.getUpdate_mark() != 2) { + // 插入关系表 + String relationStr = "," + tmp.getId() + ","; + dbSource.insertEddRelations(eddRelationTableName, + relationStr); + gatherDao.updateMark(gatherProjectsTableName, 2, + tmp.getId()); + // handleCount++; + removeSet.add(tmp.getId()); + } + } + if (model.getId() != uniqueModel.getId()) { + String relationStr = "," + model.getId() + ","; + relationSet.add(model.getId()); + return relationSet; + } else { + // if(uniqueModel.getSource().equals("github")){ + // if(gitModel!=null) + // if(compareGithubProject(uniqueModel,gitModel)) + // gitModel = uniqueModel; + // } + } + + } + // 处理查找的项目有同一source的情况,确保一个source最多只有一个项目有相同homepage + for (String uniqueSource : sameHomeMap.keySet()) { + GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeMap + .get(uniqueSource)); + // if(uniqueModel.getSource().equals("github")){ + // if(gitModel!=null) + // if(compareGithubProject(uniqueModel,gitModel)) + // gitModel = uniqueModel; + // } + if (!relationSet.contains(uniqueModel.getId()) + && !uniqueModel.getSource().equals("github")) { + matchedList.add(uniqueModel); + matchedIdList.add(uniqueModel.getId()); + } + for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)) { + if (!tmp.equals(uniqueModel) && tmp.getUpdate_mark() != 2) { + // 插入关系表 + String relationStr = "," + tmp.getId() + ","; + dbSource.insertEddRelations(eddRelationTableName, + relationStr); + gatherDao.updateMark(gatherProjectsTableName, 2, + tmp.getId()); + // handleCount++; + removeSet.add(tmp.getId()); + } + } + } + } - - //处理同名项目;github项目不用处理同名的情况 + + // 处理同名项目;github项目不用处理同名的情况 List afterNamePrjList = new ArrayList(); List afterNamePrjIdList = new ArrayList(); List sameNamePrjList; - if(isIncrese){ - sameNamePrjList = gatherDao.selectGPMBySameNameForIncrease(gatherProjectsTableName, prjName, prjId); - }else{ - sameNamePrjList = gatherDao.selectGPMBySameName(gatherProjectsTableName, prjName, prjId,1); + if (isIncrese) { + sameNamePrjList = gatherDao.selectGPMBySameNameForIncrease( + gatherProjectsTableName, prjName, prjId); + } else { + sameNamePrjList = gatherDao.selectGPMBySameName( + gatherProjectsTableName, prjName, prjId, 1); } - for(int i = 0;i synonymList = getSynonyms(model);//获取项目的同义词 - List synonymsPrjList = getRelatedGatherProjects(model,synonymList,isIncrese); + + // 处理别名相同的项目 + List synonymList = getSynonyms(model);// 获取项目的同义词 + List synonymsPrjList = getRelatedGatherProjects( + model, synonymList, isIncrese); List afterSynonymsPrjList = new ArrayList(); - for(int j = 0;j < synonymsPrjList.size();j++){//不重复比较 + for (int j = 0; j < synonymsPrjList.size(); j++) {// 不重复比较 int tmpId = synonymsPrjList.get(j).getId(); - //logger.info("project " + model.getId() + " and " + synonymsPrjList.get(j).getId() + " match by synonyms"); - if(!relationSet.contains(tmpId) && !afterNamePrjIdList.contains(tmpId) - && !matchedIdList.contains(tmpId))//未处理的,也不同名,否则会重复计算 + // logger.info("project " + model.getId() + " and " + + // synonymsPrjList.get(j).getId() + " match by synonyms"); + if (!relationSet.contains(tmpId) + && !afterNamePrjIdList.contains(tmpId) + && !matchedIdList.contains(tmpId))// 未处理的,也不同名,否则会重复计算 afterSynonymsPrjList.add(synonymsPrjList.get(j)); } - afterSynonymsPrjList = returnDupModelList(model,afterSynonymsPrjList,false); - - afterSynonymsPrjList = mergeTwoList4Source(afterNamePrjList,afterSynonymsPrjList); -// for(GatherProjectsModel m : afterSynonymsPrjList){ -// logger.info("project " + model.getId() + " and " + m.getId() + " match by synonyms/name and verify"); -// } - matchedList = mergeTwoList4Source(matchedList,afterSynonymsPrjList);//同一社区的项目相同homepage比名字相同和别名相同优先级更高 + afterSynonymsPrjList = returnDupModelList(model, afterSynonymsPrjList, + false); + + afterSynonymsPrjList = mergeTwoList4Source(afterNamePrjList, + afterSynonymsPrjList); + // for(GatherProjectsModel m : afterSynonymsPrjList){ + // logger.info("project " + model.getId() + " and " + m.getId() + + // " match by synonyms/name and verify"); + // } + matchedList = mergeTwoList4Source(matchedList, afterSynonymsPrjList);// 同一社区的项目相同homepage比名字相同和别名相同优先级更高 relationSet.add(model.getId()); - if(matchedList.size()==0) + if (matchedList.size() == 0) return relationSet; - - //递归调用有重复关系的项目 - for(int i = 0; i < matchedList.size(); i++){ + + // 递归调用有重复关系的项目 + for (int i = 0; i < matchedList.size(); i++) { GatherProjectsModel dupModel = matchedList.get(i); - if(!relationSet.contains(dupModel.getId())){ + if (!relationSet.contains(dupModel.getId())) { relationSet.add(dupModel.getId()); - if(dupModel.getUpdate_mark()!=2) - findDupProject(dupModel,relationSet,isIncrese); + if (dupModel.getUpdate_mark() != 2) + findDupProject(dupModel, relationSet, isIncrese); } } return relationSet; } - //分别对别名和关系进行插表操作:所有别名汇聚到正在处理的项目上(handlenewproject)。 - //对于重复关系的项目标记update_mark=2,之后不再重复操作 - @Transactional(propagation=Propagation.REQUIRED) - public void OperateTable(Set theRelationSet,boolean isIncrease){ - - if(theRelationSet==null) + + // 分别对别名和关系进行插表操作:所有别名汇聚到正在处理的项目上(handlenewproject)。 + // 对于重复关系的项目标记update_mark=2,之后不再重复操作 + @Transactional(propagation = Propagation.REQUIRED) + public void OperateTable(Set theRelationSet, boolean isIncrease) { + + if (theRelationSet == null) return; - + String finallyRelationStr = ","; StringBuilder relationStr = new StringBuilder(","); - + List eddRelationsList = new ArrayList(); List mergeList = new ArrayList(); - //由于递归的原因,考虑可能多个还有来自同一社区的项目 + // 由于递归的原因,考虑可能多个还有来自同一社区的项目 List finalIdList = new ArrayList(); - Map> map = new HashMap>(); - for (int id:theRelationSet) {//该集合包括处理项目本身 + Map> map = new HashMap>(); + for (int id : theRelationSet) {// 该集合包括处理项目本身 mergeList.add(gatherDao.selectGPMById(gatherProjectsTableName, id)); } - for(GatherProjectsModel model1:mergeList){ - //handleCount++; - if(map.get(model1.getSource())==null){ - map.put(model1.getSource(),new ArrayList()); + for (GatherProjectsModel model1 : mergeList) { + // handleCount++; + if (map.get(model1.getSource()) == null) { + map.put(model1.getSource(), + new ArrayList()); map.get(model1.getSource()).add(model1); - } - else + } else map.get(model1.getSource()).add(model1); } - for(String uniqueSource:map.keySet()){ - GatherProjectsModel uniqueModel = getUniqueSourceModel(map.get(uniqueSource)); - for (GatherProjectsModel tmp : map.get(uniqueSource)){ - if(!tmp.equals(uniqueModel)){ - //插入关系表 - String tmpStr = ","+tmp.getId()+","; - if(tmp.getUpdate_mark()!=2){ - dbSource.insertEddRelations(eddRelationTableName, tmpStr); - gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId()); + for (String uniqueSource : map.keySet()) { + GatherProjectsModel uniqueModel = getUniqueSourceModel(map + .get(uniqueSource)); + for (GatherProjectsModel tmp : map.get(uniqueSource)) { + if (!tmp.equals(uniqueModel)) { + // 插入关系表 + String tmpStr = "," + tmp.getId() + ","; + if (tmp.getUpdate_mark() != 2) { + dbSource.insertEddRelations(eddRelationTableName, + tmpStr); + gatherDao.updateMark(gatherProjectsTableName, 2, + tmp.getId()); removeSet.add(tmp.getId()); } } } finalIdList.add(uniqueModel.getId()); - finallyRelationStr = finallyRelationStr + uniqueModel.getId()+","; + finallyRelationStr = finallyRelationStr + uniqueModel.getId() + ","; relationStr.append(uniqueModel.getId()); relationStr.append(","); } - //增量数据的处理 - if(isIncrease){ - - for(int id:finalIdList){//从关系表取出重复关系 - EddRelations tmpRelation = dbSource.getEddRelationsByGatherProjectsId(eddRelationTableName, id); - if(tmpRelation!=null) + // 增量数据的处理 + if (isIncrease) { + + for (int id : finalIdList) {// 从关系表取出重复关系 + EddRelations tmpRelation = dbSource + .getEddRelationsByGatherProjectsId( + eddRelationTableName, id); + if (tmpRelation != null) eddRelationsList.add(tmpRelation); } - - if(eddRelationsList !=null && eddRelationsList.size()!=0){ - for(EddRelations relation: eddRelationsList ){ - String tmp = StringHandler.removeFirstComma(relation.getGather_projects_ids()); + + if (eddRelationsList != null && eddRelationsList.size() != 0) { + for (EddRelations relation : eddRelationsList) { + String tmp = StringHandler.removeFirstComma(relation + .getGather_projects_ids()); relationStr.append(tmp); } } - finallyRelationStr = StringHandler.removeDupIdInStr(relationStr.toString()); + finallyRelationStr = StringHandler.removeDupIdInStr(relationStr + .toString()); logger.info(finallyRelationStr); - } try { - if(!finallyRelationStr.equals(",")){ - for(int id:finalIdList){ - GatherProjectsModel model1 = gatherDao.selectGPMById(gatherProjectsTableName, id); - if(model1.getUpdate_mark()!=2){ + if (!finallyRelationStr.equals(",")) { + for (int id : finalIdList) { + GatherProjectsModel model1 = gatherDao.selectGPMById( + gatherProjectsTableName, id); + if (model1.getUpdate_mark() != 2) { gatherDao.updateMark(gatherProjectsTableName, 2, id); removeSet.add(model1.getId()); } } - dbSource.insertEddRelations(eddRelationTableName, finallyRelationStr); - if(isIncrease)//增量数据的关系,删除以前的,插入最新的关系 - for(EddRelations relation: eddRelationsList ){ + dbSource.insertEddRelations(eddRelationTableName, + finallyRelationStr); + if (isIncrease)// 增量数据的关系,删除以前的,插入最新的关系 + for (EddRelations relation : eddRelationsList) { logger.info("delete before relations"); - dbSource.deleteEddRelationsItem(eddRelationTableName, relation.getId()); + dbSource.deleteEddRelationsItem(eddRelationTableName, + relation.getId()); } } } catch (Exception e) { e.printStackTrace(); logger.info(finallyRelationStr); } - + } - - - //找到当前项目同义词关联的项目 要求项目不相同 因为同一个项目可能包含多个同义词 - public List getRelatedGatherProjects(GatherProjectsModel model, List synonymList,boolean isIncrease){ - + + // 找到当前项目同义词关联的项目 要求项目不相同 因为同一个项目可能包含多个同义词 + public List getRelatedGatherProjects( + GatherProjectsModel model, List synonymList, + boolean isIncrease) { + List result = new ArrayList(); - Map map = new HashMap(); + Map map = new HashMap(); GatherProjectsModel tmp; - for(String name:synonymList){ - List prjList= dbSource.getSynonymBySynonyms(synonymsTableName, name); - for(int prjId:prjList){ - if(prjId==model.getId()) + for (String name : synonymList) { + List prjList = dbSource.getSynonymBySynonyms( + synonymsTableName, name); + for (int prjId : prjList) { + if (prjId == model.getId()) continue; - if(map.containsKey(prjId)) + if (map.containsKey(prjId)) continue; tmp = gatherDao.selectGPMById(gatherProjectsTableName, prjId); - if(!isIncrease && tmp.getUpdate_mark()==2) + if (!isIncrease && tmp.getUpdate_mark() == 2) continue; map.put(prjId, tmp); } } - for (GatherProjectsModel value : map.values()) { + for (GatherProjectsModel value : map.values()) { result.add(value); - } + } return result; } - - public List returnDupModelList(GatherProjectsModel model,List list,boolean samename){ + + public List returnDupModelList( + GatherProjectsModel model, List list, + boolean samename) { List returnList = new ArrayList(); - for(GatherProjectsModel dupModel:list){ - if(!dupModel.getSource().equals(model.getSource())){ - if(isTheSame(model,dupModel,samename)) + for (GatherProjectsModel dupModel : list) { + if (!dupModel.getSource().equals(model.getSource())) { + if (isTheSame(model, dupModel, samename)) returnList.add(dupModel); } } - //如果重复列表有来自同一社区的,则再进行比较相似度 + // 如果重复列表有来自同一社区的,则再进行比较相似度 List afterReturnList = new ArrayList(); - Map> map = new HashMap>(); + Map> map = new HashMap>(); GatherProjectsModel uniqueModel = new GatherProjectsModel(); - for(GatherProjectsModel model1:returnList){ - if(map.get(model1.getSource())==null){ - map.put(model1.getSource(),new ArrayList()); + for (GatherProjectsModel model1 : returnList) { + if (map.get(model1.getSource()) == null) { + map.put(model1.getSource(), + new ArrayList()); map.get(model1.getSource()).add(model1); - } - else + } else map.get(model1.getSource()).add(model1); } - for(String uniqueSource:map.keySet()){ + for (String uniqueSource : map.keySet()) { uniqueModel = getUniqueSourceModel(map.get(uniqueSource)); - - for (GatherProjectsModel tmp : map.get(uniqueSource)){ - if(!tmp.equals(uniqueModel) && tmp.getUpdate_mark()!=2){ - //插入关系表 - String relationStr = ","+tmp.getId()+","; - dbSource.insertEddRelations(eddRelationTableName, relationStr); - gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId()); - //handleCount++; + + for (GatherProjectsModel tmp : map.get(uniqueSource)) { + if (!tmp.equals(uniqueModel) && tmp.getUpdate_mark() != 2) { + // 插入关系表 + String relationStr = "," + tmp.getId() + ","; + dbSource.insertEddRelations(eddRelationTableName, + relationStr); + gatherDao.updateMark(gatherProjectsTableName, 2, + tmp.getId()); + // handleCount++; removeSet.add(tmp.getId()); } } - if(!uniqueModel.getSource().equals("github")) + if (!uniqueModel.getSource().equals("github")) afterReturnList.add(uniqueModel); -// else -// if(compareGithubProject(uniqueModel, gitModel)) -// gitModel = uniqueModel; - + // else + // if(compareGithubProject(uniqueModel, gitModel)) + // gitModel = uniqueModel; + } return afterReturnList; - + } - - public boolean compareGithubProject(GatherProjectsModel model1,GatherProjectsModel model2){ - - if(platformProjectDao.getGithubPrjByUrl(model1.getUrl()).getStargazers_count() - > platformProjectDao.getGithubPrjByUrl(model2.getUrl()).getStargazers_count()) + + public boolean compareGithubProject(GatherProjectsModel model1, + GatherProjectsModel model2) { + + if (platformProjectDao.getGithubPrjByUrl(model1.getUrl()) + .getStargazers_count() > platformProjectDao.getGithubPrjByUrl( + model2.getUrl()).getStargazers_count()) return true; else return false; - + } - - public GatherProjectsModel getUniqueSourceModel(List list){ + + public GatherProjectsModel getUniqueSourceModel( + List list) { GatherProjectsModel returnModel = new GatherProjectsModel(); String source = list.get(0).getSource(); - if(list.size()==1) + if (list.size() == 1) returnModel = list.get(0); - else{ + else { GatherProjectsModel tmp = list.get(0); - for(GatherProjectsModel model1:list){ - if(model1.getUrl()==null || model1.getUrl().equals("")) + for (GatherProjectsModel model1 : list) { + if (model1.getUrl() == null || model1.getUrl().equals("")) continue; - if(source.equals("apache")||source.equals("freecode")) + if (source.equals("apache") || source.equals("freecode")) returnModel = list.get(0); - if(source.equals("oschina")){ + if (source.equals("oschina")) { try { - int a = platformProjectDao.getOschinaPrjByUrl(oschinaTableName,model1.getUrl()).getStar_num(); - int b = platformProjectDao.getOschinaPrjByUrl(oschinaTableName,tmp.getUrl()).getStar_num(); - if(a > b){ + int a = platformProjectDao.getOschinaPrjByUrl( + oschinaTableName, model1.getUrl()) + .getStar_num(); + int b = platformProjectDao.getOschinaPrjByUrl( + oschinaTableName, tmp.getUrl()).getStar_num(); + if (a > b) { tmp = model1; } } catch (Exception e) { - System.out.println(model1.getSource()+" ==== "+model1.getUrl()); - System.out.println(tmp.getSource()+" ==== "+tmp.getUrl()); + System.out.println(model1.getSource() + " ==== " + + model1.getUrl()); + System.out.println(tmp.getSource() + " ==== " + + tmp.getUrl()); } } - if(source.equals("openhub")){ + if (source.equals("openhub")) { try { - int a = platformProjectDao.getOpenHubPrjByUrl(openhubTableName,model1.getUrl()).getUser_num(); - int b = platformProjectDao.getOpenHubPrjByUrl(openhubTableName,tmp.getUrl()).getUser_num(); - if(a > b){ + int a = platformProjectDao.getOpenHubPrjByUrl( + openhubTableName, model1.getUrl()) + .getUser_num(); + int b = platformProjectDao.getOpenHubPrjByUrl( + openhubTableName, tmp.getUrl()).getUser_num(); + if (a > b) { tmp = model1; } } catch (Exception e) { - System.out.println(model1.getSource()+" ==== "+model1.getUrl()); - System.out.println(tmp.getSource()+" ==== "+tmp.getUrl()); + System.out.println(model1.getSource() + " ==== " + + model1.getUrl()); + System.out.println(tmp.getSource() + " ==== " + + tmp.getUrl()); } } - if(source.equals("sourceforge")){ - try{ - int a = platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,model1.getUrl()).getDownload_num(); - int b = platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,tmp.getUrl()).getDownload_num(); - if(a > b){ + if (source.equals("sourceforge")) { + try { + int a = platformProjectDao.getSourceForgePrjByUrl( + sourceforgeTableName, model1.getUrl()) + .getDownload_num(); + int b = platformProjectDao.getSourceForgePrjByUrl( + sourceforgeTableName, tmp.getUrl()) + .getDownload_num(); + if (a > b) { tmp = model1; } + } catch (Exception e) { + System.out.println(model1.getSource() + " ==== " + + model1.getUrl()); + System.out.println(tmp.getSource() + " ==== " + + tmp.getUrl()); } - catch(Exception e){ - System.out.println(model1.getSource()+" ==== "+model1.getUrl()); - System.out.println(tmp.getSource()+" ==== "+tmp.getUrl()); - } - + } } returnModel = tmp; } return returnModel; } - - - public boolean isTheSame(GatherProjectsModel model1, GatherProjectsModel model2,boolean sameName){ + + public boolean isTheSame(GatherProjectsModel model1, + GatherProjectsModel model2, boolean sameName) { double similarity; int similarTagNum; - boolean isEqual=false; - if(model1.getSource().equals(model2.getSource())) + boolean isEqual = false; + if (model1.getSource().equals(model2.getSource())) return false; - //sourceforge描述信息为空时默认为No description...汇总应该直接处理为空 - if(model1.getSource().equals("sourceforge")){ - if(model1.getDescription()!=null && !model1.getDescription().equals("")) - if(model1.getDescription().equals("No description "+model1.getName()+" Web Site")) + // sourceforge描述信息为空时默认为No description...汇总应该直接处理为空 + if (model1.getSource().equals("sourceforge")) { + if (model1.getDescription() != null + && !model1.getDescription().equals("")) + if (model1.getDescription().equals( + "No description " + model1.getName() + " Web Site")) model1.setDescription(null); } - if(model2.getSource().equals("sourceforge")){ - if(model2.getDescription()!=null && !model2.getDescription().equals("")) - if(model2.getDescription().equals("No description "+model2.getName()+" Web Site")) + if (model2.getSource().equals("sourceforge")) { + if (model2.getDescription() != null + && !model2.getDescription().equals("")) + if (model2.getDescription().equals( + "No description " + model2.getName() + " Web Site")) model2.setDescription(null); } - //oschina的tag和description都为中文 - if(model1.getSource().equals("oschina")||model2.getSource().equals("oschina")){ - if(sameName){ - if(model1.getSource().equals("oschina")) - isEqual = StringHandler.compareLanguage(model1.getLanguage(),model2.getLanguage()); - if(model2.getSource().equals("oschina")) - isEqual = StringHandler.compareLanguage(model2.getLanguage(),model1.getLanguage()); - if (isEqual) - return true; - } + // oschina的tag和description都为中文 + if (model1.getSource().equals("oschina") + || model2.getSource().equals("oschina")) { + if (sameName) { + if (model1.getSource().equals("oschina")) + isEqual = StringHandler.compareLanguage( + model1.getLanguage(), model2.getLanguage()); + if (model2.getSource().equals("oschina")) + isEqual = StringHandler.compareLanguage( + model2.getLanguage(), model1.getLanguage()); + if (isEqual) + return true; + } } - //获取两个对象的tags属性 + // 获取两个对象的tags属性 String tags1 = model1.getTags(); String tags2 = model2.getTags(); similarTagNum = calSimilarityByTag(model1, model2); -//System.out.println("calSimilarityByTag "+model1.getId()+" and "+model2.getId()+" : "+String.valueOf(similarTagNum)); - if(sameName){//相同名字的项目 - if(similarTagNum >=1) + // System.out.println("calSimilarityByTag "+model1.getId()+" and "+model2.getId()+" : "+String.valueOf(similarTagNum)); + if (sameName) {// 相同名字的项目 + if (similarTagNum >= 1) return true; - else{ + else { similarity = calSimilarity(model1, model2); -//System.out.println("same name to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity); - if(similarity>0.2) + // System.out.println("same name to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity); + if (similarity > 0.2) + return true; + else + return false; + } + } else {// 相同别名的项目 + if (similarTagNum > 1) + return true; + else { + similarity = calSimilarity(model1, model2); + // System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity); + if (similarity >= 0.5) return true; else return false; } } - else{//相同别名的项目 - if(similarTagNum >1) - return true; - else{ - similarity = calSimilarity(model1, model2); -//System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity); - if(similarity>=0.5) - return true; - else - return false; - } - } - + } - - public double calSimilarity(GatherProjectsModel model1, GatherProjectsModel model2){ - if(model1.getDescription()==null || model1.getDescription().equals("")||model2.getDescription()==null ||model2.getDescription().equals("")) + + public double calSimilarity(GatherProjectsModel model1, + GatherProjectsModel model2) { + if (model1.getDescription() == null + || model1.getDescription().equals("") + || model2.getDescription() == null + || model2.getDescription().equals("")) return 0; NGram ngram = new NGram(4); - return (1-ngram.distance(model1.getDescription(),model2.getDescription())); + return (1 - ngram.distance(model1.getDescription(), + model2.getDescription())); } - //计算都有标签的项目之间的相似度 - public int calSimilarityByTag(GatherProjectsModel model1, GatherProjectsModel model2){ - if(model1.getTags()==null || model1.getTags().equals("")||model2.getTags()==null ||model2.getTags().equals("")) + + // 计算都有标签的项目之间的相似度 + public int calSimilarityByTag(GatherProjectsModel model1, + GatherProjectsModel model2) { + if (model1.getTags() == null || model1.getTags().equals("") + || model2.getTags() == null || model2.getTags().equals("")) return 0; String[] tagArray_model1 = model1.getTags().split(","); String[] tagArray_model2 = model2.getTags().split(","); if (model1.getSource().equals("sourceforge")) - tagArray_model1 = StringHandler.splitTagsByBracket(model1.getTags()); + tagArray_model1 = StringHandler + .splitTagsByBracket(model1.getTags()); if (model2.getSource().equals("sourceforge")) - tagArray_model2 = StringHandler.splitTagsByBracket(model2.getTags()); + tagArray_model2 = StringHandler + .splitTagsByBracket(model2.getTags()); List tags_model1 = changeArrayToList(tagArray_model1); List tags_model2 = changeArrayToList(tagArray_model2); - int sameNum = getSameTagsNum(tags_model1,tags_model2); + int sameNum = getSameTagsNum(tags_model1, tags_model2); return sameNum; } - - public List changeArrayToList(String[] strs){ + + public List changeArrayToList(String[] strs) { List result = new ArrayList(); - for(String str:strs){ - str = RegexHandler.extractEngDecimalAndChinese(str);//对标签进行处理 + for (String str : strs) { + str = RegexHandler.extractEngDecimalAndChinese(str);// 对标签进行处理 result.add(str); } return result; } - - public String deleteHttpPre(String homepage){ - if(homepage != null && !"".equals(homepage)){ + + public String deleteHttpPre(String homepage) { + if (homepage != null && !"".equals(homepage)) { homepage = homepage.trim(); int index_http = homepage.indexOf("http://"); int index_https = homepage.indexOf("https://"); - if(index_http == 0){ - homepage = homepage.substring(index_http + 7); //去除http:// - }else if(index_https == 0){ - homepage = homepage.substring(index_https + 8); //去除https:// - }else{ - //表示不是以http或https开头 + if (index_http == 0) { + homepage = homepage.substring(index_http + 7); // 去除http:// + } else if (index_https == 0) { + homepage = homepage.substring(index_https + 8); // 去除https:// + } else { + // 表示不是以http或https开头 } - - if((homepage.lastIndexOf("/") == homepage.length()-1) && (homepage.length() != 0)){ - homepage = homepage.substring(0, homepage.length() - 1);//如果homepage的最后一个字符是/ 需要去除掉 + + if ((homepage.lastIndexOf("/") == homepage.length() - 1) + && (homepage.length() != 0)) { + homepage = homepage.substring(0, homepage.length() - 1);// 如果homepage的最后一个字符是/ + // 需要去除掉 } } return homepage; } - - public static boolean isRightUrlByRegex(String urlStr){ - Pattern p = Pattern.compile("http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?",Pattern.CASE_INSENSITIVE ); - Matcher m = p.matcher(urlStr); - if(m.find()){ - //System.out.println(m.group()); - return true; - }else - return false; + + public static boolean isRightUrlByRegex(String urlStr) { + Pattern p = Pattern.compile( + "http(s)?://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?", + Pattern.CASE_INSENSITIVE); + Matcher m = p.matcher(urlStr); + if (m.find()) { + // System.out.println(m.group()); + return true; + } else + return false; } - public List removeDupStr(List array){ - List list = new ArrayList(); - Set set = new HashSet(); - for(String s : array){ - if(set.add(s.toLowerCase())){ - list.add(s); - } - } - return list; - } - - //将两个list进行合并 - public List mergeTwoList(List list1, List list2){ + public List removeDupStr(List array) { + List list = new ArrayList(); + Set set = new HashSet(); + for (String s : array) { + if (set.add(s.toLowerCase())) { + list.add(s); + } + } + return list; + } + + // 将两个list进行合并 + public List mergeTwoList( + List list1, List list2) { List result = new ArrayList(); Set set = new HashSet(); - for(GatherProjectsModel model:list1){ - if(set.add(model.getId())) + for (GatherProjectsModel model : list1) { + if (set.add(model.getId())) result.add(model); } - for(GatherProjectsModel model:list2){ - if(set.add(model.getId())) + for (GatherProjectsModel model : list2) { + if (set.add(model.getId())) result.add(model); } return result; } - - public List mergeTwoList4Source(List list1, List list2){ + + public List mergeTwoList4Source( + List list1, List list2) { List result = new ArrayList(); Set set = new HashSet(); Set sourceSet = new HashSet(); - for(GatherProjectsModel model:list1){ + for (GatherProjectsModel model : list1) { sourceSet.add(model.getSource()); - if(set.add(model.getId())) + if (set.add(model.getId())) result.add(model); } - for(GatherProjectsModel model:list2){ - if(set.add(model.getId())&& sourceSet.add(model.getSource())) + for (GatherProjectsModel model : list2) { + if (set.add(model.getId()) && sourceSet.add(model.getSource())) result.add(model); } return result; } - - public int getSameTagsNum(List list1, List list2){ + + public int getSameTagsNum(List list1, List list2) { int count = 0; - for (String str:list1){ - if(isExist(str,list2)) + for (String str : list1) { + if (isExist(str, list2)) count++; } return count; - + } - - public boolean isExist(String input,List list){ - boolean b=false; - for(String str:list){ - if(str.toLowerCase().equals(input.toLowerCase())) - return true; - } - return false; + + public boolean isExist(String input, List list) { + boolean b = false; + for (String str : list) { + if (str.toLowerCase().equals(input.toLowerCase())) + return true; + } + return false; } - - public int includeSubStringNum(String str,String subStr){ + + public int includeSubStringNum(String str, String subStr) { int count = 0; - if (str.indexOf(subStr)==-1) - { - return 0; - } - else if(str.indexOf(subStr) != -1) - { - count++; - count += includeSubStringNum(str.substring(str.indexOf(subStr)+subStr.length()),subStr); - return count; - } + if (str.indexOf(subStr) == -1) { + return 0; + } else if (str.indexOf(subStr) != -1) { + count++; + count += includeSubStringNum( + str.substring(str.indexOf(subStr) + subStr.length()), + subStr); + return count; + } return 0; } - - public static void main(String [] args){ - ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml"); - MergeProjectNew2 Main = applicationContext.getBean(MergeProjectNew2.class); + + public static void main(String[] args) { + ApplicationContext applicationContext = new ClassPathXmlApplicationContext( + "classpath:/applicationContext*.xml"); + MergeProjectNew2 Main = applicationContext + .getBean(MergeProjectNew2.class); Main.test(); } - - public void test(){ - GatherProjectsModel model = gatherDao.selectGPMById(gatherProjectsTableName, 36707); - handleNewProject(model,false); + + public void test() { + GatherProjectsModel model = gatherDao.selectGPMById( + gatherProjectsTableName, 1); + handleNewProject(model, false); } }