Merge branch 'master' of https://git.trustie.net/gcm3651/ossean
This commit is contained in:
commit
4e3ff50396
|
@ -16,7 +16,6 @@ public class ClearTable {
|
|||
Logger logger = Logger.getLogger(ClearTable.class);
|
||||
public void start(){
|
||||
truncateTable(TableName.eddRelationTableName);
|
||||
//updateFlag(TableName.synonymsTableName, null);
|
||||
updateMark(TableName.gatherProjectsTableName,1);
|
||||
deleteItemInEddPointers(TableName.pointerTableName,TableName.gatherProjectsTableName, TableName.eddRelationTableName);
|
||||
logger.info("完成去重程序运行前数据表的清空");
|
||||
|
@ -117,7 +116,7 @@ public class ClearTable {
|
|||
Connection conn = null;
|
||||
try {
|
||||
Class.forName("com.mysql.jdbc.Driver").newInstance();
|
||||
conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/ossean_production", "root", "NUDTpdl@123");
|
||||
conn = DriverManager.getConnection(TableName.connection, TableName.user, TableName.psw);
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IllegalAccessException e) {
|
||||
|
|
|
@ -81,7 +81,7 @@ public class ClearTableBeforTransfer {
|
|||
Connection conn = null;
|
||||
try {
|
||||
Class.forName("com.mysql.jdbc.Driver").newInstance();
|
||||
conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/ossean_production", "root", "NUDTpdl@123");
|
||||
conn = DriverManager.getConnection(TableName.connection,TableName.user, TableName.psw);
|
||||
} catch (InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IllegalAccessException e) {
|
||||
|
|
|
@ -51,6 +51,7 @@ public class MergeProjects {
|
|||
}
|
||||
|
||||
public void start(){
|
||||
logger.info("start remove projects!");
|
||||
int count=0;
|
||||
count = readPointer(pointerTableName,sourceTableName,targetTableName, count);//指针表计量处理的项目数
|
||||
long start = System.currentTimeMillis();
|
||||
|
@ -73,7 +74,7 @@ public class MergeProjects {
|
|||
//logger.info("Duplicate remove:handling project " + model.getId());
|
||||
haveRemovedSet.addAll(mergeProjectNew2.handleNewProject(model));
|
||||
}
|
||||
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, haveRemovedSet.size());
|
||||
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, count+haveRemovedSet.size());
|
||||
if(haveRemovedSet.size()%10000==0)
|
||||
logger.warn("deal with:"+haveRemovedSet.size()+" projects cost: "+(float)(System.currentTimeMillis() - start)/60000+" minutes");
|
||||
}
|
||||
|
@ -87,7 +88,7 @@ public class MergeProjects {
|
|||
MergeProjects Main = applicationContext.getBean(MergeProjects.class);
|
||||
GetSynonyms getSynonyms = (GetSynonyms) applicationContext.getBean("getSynonyms");
|
||||
getSynonyms.start();
|
||||
System.out.println("insertSynonyms thread start~");
|
||||
System.out.println("insertSynonyms thread start!");
|
||||
Main.start();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,15 @@ public class TableName {
|
|||
public static String taggingsTableName = "taggings";
|
||||
public static String tagsTableName = "tags";
|
||||
public static String eddRelationTableName = "edd_relations_17";
|
||||
public static String openSourceProjectsTableName = "open_source_projects";
|
||||
public static String openSourceProjectsTableName = "open_source_projects_17";
|
||||
public static String sourceforgeTableName = "sourceforge_project";
|
||||
public static String openhubTableName = "openhub_project";
|
||||
public static String oschinaTableName = "oschina_project";
|
||||
|
||||
public static String connection = "jdbc:mysql://localhost:3306/ossean_production";
|
||||
public static String user = "root";
|
||||
public static String psw = "NUDTpdl@123";
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -10,13 +10,13 @@ import com.ossean.model.SourceForgeProject;
|
|||
|
||||
public interface PlatformProjectDao {
|
||||
|
||||
@Select("select name,rate_num,user_num from openhub_project where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public OpenhubProject getOpenHubPrjByUrl( @Param("url") String url);
|
||||
@Select("select name,rate_num,user_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public OpenhubProject getOpenHubPrjByUrl( @Param("table") String table,@Param("url") String url);
|
||||
|
||||
@Select("select name,download_num,favor_num from sourceforge_project where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public SourceForgeProject getSourceForgePrjByUrl( @Param("url") String url);//oschina_project
|
||||
@Select("select name,download_num,favor_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public SourceForgeProject getSourceForgePrjByUrl(@Param("table") String table, @Param("url") String url);//oschina_project
|
||||
|
||||
@Select("select name,star_num,user_num from sourceforge_project where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public OschinaProject getOschinaPrjByUrl(@Param("url") String url);
|
||||
@Select("select name,star_num,user_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public OschinaProject getOschinaPrjByUrl(@Param("table") String table,@Param("url") String url);
|
||||
|
||||
}
|
||||
|
|
|
@ -55,8 +55,11 @@ public class MergeProjectNew2 {
|
|||
private String synonymsTableName = TableName.synonymsTableName;
|
||||
private String gatherProjectsTableName = TableName.gatherProjectsTableName;
|
||||
private String eddRelationTableName = TableName.eddRelationTableName;
|
||||
private String sourceforgeTableName = TableName.sourceforgeTableName;
|
||||
private String openhubTableName = TableName.openhubTableName;
|
||||
private String oschinaTableName = TableName.oschinaTableName;
|
||||
|
||||
private static Set<Integer> passSet;
|
||||
//private static Set<Integer> passSet;
|
||||
private static Set<Integer> relationSet;
|
||||
|
||||
public List<String> getSynonyms(GatherProjectsModel model){
|
||||
|
@ -67,15 +70,19 @@ public class MergeProjectNew2 {
|
|||
//处理新项目
|
||||
|
||||
public Set<Integer> handleNewProject(GatherProjectsModel model){
|
||||
passSet = new HashSet<Integer>();//已经处理的项目,避免重复处理陷入死锁
|
||||
if(model.getUpdate_mark()==2)
|
||||
return null;
|
||||
//passSet = new HashSet<Integer>();//已经处理的项目,避免重复处理陷入死锁
|
||||
relationSet = new HashSet<Integer>();//所有重复的项目集合
|
||||
relationSet = findDupProject(model,relationSet,passSet);
|
||||
relationSet = findDupProject(model,relationSet);
|
||||
OperateTable(relationSet, model.getId());//表操作
|
||||
return relationSet;
|
||||
}
|
||||
|
||||
public Set<Integer> findDupProject(GatherProjectsModel model,Set<Integer> relationSet,Set<Integer> passSet){
|
||||
public Set<Integer> findDupProject(GatherProjectsModel model,Set<Integer> relationSet){
|
||||
|
||||
if(gatherDao.selectGPMById(gatherProjectsTableName, model.getId()).getUpdate_mark()==2)
|
||||
return relationSet;
|
||||
String prjName = model.getName();
|
||||
int prjId = model.getId();
|
||||
String prjHomepage = model.getHomepage();
|
||||
|
@ -88,11 +95,17 @@ public class MergeProjectNew2 {
|
|||
if(prjHomepage == null || prjHomepage.equals(""))
|
||||
if(model.getSource().equals("apache"))
|
||||
prjHomepage = model.getUrl();
|
||||
// if(model.getSource().equals("openhub"))
|
||||
// if(!isRightUrlByRegex(model.getHomepage())){
|
||||
// prjHomepage=null;
|
||||
// }
|
||||
//homepage相同认为一定是相同的项目
|
||||
//暂时性对openhub项目进行处理
|
||||
if(model.getSource().equals("openhub")){
|
||||
if(!isRightUrlByRegex(model.getHomepage())||includeSubStringNum(model.getHomepage(), "http")>1){
|
||||
prjHomepage=null;
|
||||
}
|
||||
}
|
||||
|
||||
//处理homepage相同的项目,分别对和当前处理项目同一社区的、不同社区的进行处理
|
||||
List<GatherProjectsModel> sameHomeAndSourceList = new ArrayList<GatherProjectsModel>();
|
||||
sameHomeAndSourceList.add(model);
|
||||
Map<String,List<GatherProjectsModel>> sameHomeMap = new HashMap<String, List<GatherProjectsModel>>();
|
||||
if(prjHomepage != null && !prjHomepage.equals("")){
|
||||
prjHomepage = deleteHttpPre(prjHomepage);//去掉“http&https”前缀及“/”后缀
|
||||
if(prjHomepage.length() > 0){
|
||||
|
@ -104,45 +117,90 @@ public class MergeProjectNew2 {
|
|||
sameHomepageList = gatherDao.selectGPMBySameHomePage(gatherProjectsTableName, prjHomepage, homepage1, homepage2, homepage3, homepage4, model.getId(),1);
|
||||
for(GatherProjectsModel m : sameHomepageList){
|
||||
//logger.info("project " + model.getId() + " and " + m.getId() + " match by homepage");
|
||||
if(!passSet.contains(m.getId()))
|
||||
matchedList.add(m);
|
||||
if(!m.getSource().equals(model.getSource())){
|
||||
if(sameHomeMap.get(m.getSource())==null){
|
||||
sameHomeMap.put(m.getSource(),new ArrayList<GatherProjectsModel>());
|
||||
sameHomeMap.get(m.getSource()).add(m);
|
||||
}
|
||||
else
|
||||
sameHomeMap.get(m.getSource()).add(m);
|
||||
}
|
||||
else{
|
||||
sameHomeAndSourceList.add(m);
|
||||
}
|
||||
}
|
||||
//同一社区,homepage相同,则选出最有影响力的,其余插表,更改当前处理的model指向。
|
||||
if(sameHomeAndSourceList.size() != 0){
|
||||
GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeAndSourceList);
|
||||
|
||||
for (GatherProjectsModel tmp : sameHomeAndSourceList){
|
||||
if(!tmp.equals(uniqueModel)){
|
||||
//插入关系表
|
||||
String relationStr = ","+tmp.getId()+",";
|
||||
dbSource.insertEddRelations(eddRelationTableName, relationStr);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
|
||||
}
|
||||
}
|
||||
model = uniqueModel;
|
||||
}
|
||||
//处理查找的项目有同一source的情况,确保一个source最多只有一个项目有相同homepage
|
||||
for(String uniqueSource:sameHomeMap.keySet()){
|
||||
GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeMap.get(uniqueSource));
|
||||
if(!relationSet.contains(uniqueModel.getId()))
|
||||
matchedList.add(uniqueModel);
|
||||
for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)){
|
||||
if(!tmp.equals(uniqueModel)){
|
||||
//插入关系表
|
||||
String relationStr = ","+tmp.getId()+",";
|
||||
dbSource.insertEddRelations(eddRelationTableName, relationStr);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
//处理同名项目
|
||||
List<GatherProjectsModel> sameNamePrjList = gatherDao.selectGPMBySameName(gatherProjectsTableName, prjName, prjId,1);
|
||||
List<GatherProjectsModel> afterNamePrjList = new ArrayList<GatherProjectsModel>();
|
||||
for(int i = 0;i<sameNamePrjList.size();i++){
|
||||
List<Integer> afterNamePrjIdList = new ArrayList<Integer>();
|
||||
for(int i = 0;i<sameNamePrjList.size();i++){//不重复比较
|
||||
//logger.info("project " + model.getId() + " and " + sameNamePrjList.get(i).getId() + " match by name");
|
||||
if(!passSet.contains(sameNamePrjList.get(i).getId()))//未处理的
|
||||
if(!relationSet.contains(sameNamePrjList.get(i).getId())){//未处理的
|
||||
afterNamePrjList.add(sameNamePrjList.get(i));
|
||||
}
|
||||
}
|
||||
afterNamePrjList = returnDupModelList(model,afterNamePrjList,true);
|
||||
for(GatherProjectsModel a:afterNamePrjList){
|
||||
afterNamePrjIdList.add(a.getId());
|
||||
}
|
||||
//处理别名相同的项目
|
||||
List<String> synonymList = getSynonyms(model);//获取项目的同义词
|
||||
List<GatherProjectsModel> synonymsPrjList = getRelatedGatherProjects(model,synonymList);
|
||||
List<GatherProjectsModel> afterSynonymsPrjList = new ArrayList<GatherProjectsModel>();
|
||||
for(int j = 0;j < synonymsPrjList.size();j++){
|
||||
for(int j = 0;j < synonymsPrjList.size();j++){//不重复比较
|
||||
//logger.info("project " + model.getId() + " and " + synonymsPrjList.get(j).getId() + " match by synonyms");
|
||||
if(!passSet.contains(synonymsPrjList.get(j).getId()) && !afterNamePrjList.contains(model))//未处理的,也不同名,否则会重复计算
|
||||
if(!relationSet.contains(synonymsPrjList.get(j).getId()) && !afterNamePrjIdList.contains(synonymsPrjList.get(j).getId()))//未处理的,也不同名,否则会重复计算
|
||||
afterSynonymsPrjList.add(synonymsPrjList.get(j));
|
||||
}
|
||||
afterSynonymsPrjList = returnDupModelList(model,afterSynonymsPrjList,false);
|
||||
// for(GatherProjectsModel m : afterSynonymsPrjList){
|
||||
// logger.info("project " + model.getId() + " and " + m.getId() + " match by synonyms/name and verify");
|
||||
// }
|
||||
|
||||
afterSynonymsPrjList = mergeTwoList4Source(afterNamePrjList,afterSynonymsPrjList);
|
||||
// for(GatherProjectsModel m : afterSynonymsPrjList){
|
||||
// logger.info("project " + model.getId() + " and " + m.getId() + " match by synonyms/name and verify");
|
||||
// }
|
||||
matchedList = mergeTwoList4Source(matchedList,afterSynonymsPrjList);//同一社区的项目相同homepage比名字相同和别名相同优先级更高
|
||||
relationSet.add(model.getId());
|
||||
passSet.add(model.getId());
|
||||
//passSet.add(model.getId());
|
||||
if(matchedList.size()==0)
|
||||
return relationSet;
|
||||
|
||||
//递归调用有重复关系的项目
|
||||
for(int i = 0; i < matchedList.size(); i++){
|
||||
GatherProjectsModel dupModel = matchedList.get(i);
|
||||
if(!passSet.contains(dupModel.getId())){
|
||||
findDupProject(dupModel,relationSet,passSet);
|
||||
if(!relationSet.contains(dupModel.getId())){
|
||||
findDupProject(dupModel,relationSet);
|
||||
}
|
||||
}
|
||||
return relationSet;
|
||||
|
@ -151,18 +209,53 @@ public class MergeProjectNew2 {
|
|||
//对于重复关系的项目标记update_mark=2,之后不再重复操作
|
||||
@Transactional(propagation=Propagation.REQUIRED)
|
||||
public void OperateTable(Set<Integer> relationSet,int prjId){
|
||||
List<String> dupSynonymsList = new ArrayList<String>();
|
||||
String relationStr = ",";
|
||||
|
||||
String finallyRelationStr = ",";
|
||||
List<GatherProjectsModel> mergeList = new ArrayList<GatherProjectsModel>();
|
||||
Map<String,List<GatherProjectsModel>> map = new HashMap<String, List<GatherProjectsModel>>();
|
||||
for (int id:relationSet) {//该集合包括处理项目本身
|
||||
relationStr = relationStr+id+",";
|
||||
dupSynonymsList.addAll(dbSource.getSynonymByPrjId(synonymsTableName, id));//获取flag=1的同义词(确定的)
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2, id);
|
||||
mergeList.add(gatherDao.selectGPMById(gatherProjectsTableName, id));
|
||||
}
|
||||
dupSynonymsList = removeDupStr(dupSynonymsList);
|
||||
for(String synonyms:dupSynonymsList){
|
||||
dbSource.insertSynonymsNew(synonymsTableName,prjId,synonyms,1);
|
||||
for(GatherProjectsModel model1:mergeList){
|
||||
if(map.get(model1.getSource())==null){
|
||||
map.put(model1.getSource(),new ArrayList<GatherProjectsModel>());
|
||||
map.get(model1.getSource()).add(model1);
|
||||
}
|
||||
else
|
||||
map.get(model1.getSource()).add(model1);
|
||||
}
|
||||
for(String uniqueSource:map.keySet()){
|
||||
GatherProjectsModel uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
|
||||
for (GatherProjectsModel tmp : map.get(uniqueSource)){
|
||||
if(!tmp.equals(uniqueModel)){
|
||||
//插入关系表
|
||||
String relationStr = ","+tmp.getId()+",";
|
||||
dbSource.insertEddRelations(eddRelationTableName, relationStr);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
|
||||
}
|
||||
else
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
|
||||
}
|
||||
finallyRelationStr = finallyRelationStr + uniqueModel.getId()+",";
|
||||
}
|
||||
|
||||
// List<String> dupSynonymsList = new ArrayList<String>();
|
||||
// for (int id:relationSet) {//该集合包括处理项目本身
|
||||
// relationStr = relationStr+id+",";
|
||||
// dupSynonymsList.addAll(dbSource.getSynonymByPrjId(synonymsTableName, id));//获取flag=1的同义词(确定的)
|
||||
// gatherDao.updateMark(gatherProjectsTableName, 2, id);
|
||||
// }
|
||||
// dupSynonymsList = removeDupStr(dupSynonymsList);
|
||||
// for(String synonyms:dupSynonymsList){
|
||||
// dbSource.insertSynonymsNew(synonymsTableName,prjId,synonyms,1);
|
||||
// }
|
||||
|
||||
try {
|
||||
dbSource.insertEddRelations(eddRelationTableName, finallyRelationStr);
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
logger.info(finallyRelationStr);
|
||||
}
|
||||
}
|
||||
|
||||
//找到当前项目同义词关联的项目 要求项目不相同 因为同一个项目可能包含多个同义词
|
||||
|
@ -212,6 +305,14 @@ public class MergeProjectNew2 {
|
|||
}
|
||||
for(String uniqueSource:map.keySet()){
|
||||
uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
|
||||
for (GatherProjectsModel tmp : map.get(uniqueSource)){
|
||||
if(!tmp.equals(uniqueModel)){
|
||||
//插入关系表
|
||||
String relationStr = ","+tmp.getId()+",";
|
||||
dbSource.insertEddRelations(eddRelationTableName, relationStr);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2, tmp.getId());
|
||||
}
|
||||
}
|
||||
afterReturnList.add(uniqueModel);
|
||||
}
|
||||
return afterReturnList;
|
||||
|
@ -225,31 +326,49 @@ public class MergeProjectNew2 {
|
|||
returnModel = list.get(0);
|
||||
else{
|
||||
GatherProjectsModel tmp = list.get(0);
|
||||
|
||||
for(GatherProjectsModel model1:list){
|
||||
if(model1.getUrl()==null || model1.getUrl().equals(""))
|
||||
continue;
|
||||
if(source.equals("apache")||source.equals("freecode"))
|
||||
returnModel = list.get(0);
|
||||
// if(source.equals("oschina")){
|
||||
// int a = platformProjectDao.getOschinaPrjByUrl(model1.getUrl()).getStar_num();
|
||||
// int b = platformProjectDao.getOschinaPrjByUrl(tmp.getUrl()).getStar_num();
|
||||
// if(a > b){
|
||||
// tmp = model1;
|
||||
// }
|
||||
// }
|
||||
if(source.equals("openhub")){
|
||||
int a = platformProjectDao.getOpenHubPrjByUrl(model1.getUrl()).getUser_num();
|
||||
int b = platformProjectDao.getOpenHubPrjByUrl(tmp.getUrl()).getUser_num();
|
||||
if(source.equals("oschina")){
|
||||
try {
|
||||
int a = platformProjectDao.getOschinaPrjByUrl(oschinaTableName,model1.getUrl()).getStar_num();
|
||||
int b = platformProjectDao.getOschinaPrjByUrl(oschinaTableName,tmp.getUrl()).getStar_num();
|
||||
if(a > b){
|
||||
tmp = model1;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println(model1.getSource()+" ==== "+model1.getUrl());
|
||||
System.out.println(tmp.getSource()+" ==== "+tmp.getUrl());
|
||||
}
|
||||
}
|
||||
if(source.equals("openhub")){
|
||||
try {
|
||||
int a = platformProjectDao.getOpenHubPrjByUrl(openhubTableName,model1.getUrl()).getUser_num();
|
||||
int b = platformProjectDao.getOpenHubPrjByUrl(openhubTableName,tmp.getUrl()).getUser_num();
|
||||
if(a > b){
|
||||
tmp = model1;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
System.out.println(model1.getSource()+" ==== "+model1.getUrl());
|
||||
System.out.println(tmp.getSource()+" ==== "+tmp.getUrl());
|
||||
}
|
||||
}
|
||||
if(source.equals("sourceforge")){
|
||||
int a = platformProjectDao.getSourceForgePrjByUrl(model1.getUrl()).getDownload_num();
|
||||
int b = platformProjectDao.getSourceForgePrjByUrl(tmp.getUrl()).getDownload_num();
|
||||
try{
|
||||
int a = platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,model1.getUrl()).getDownload_num();
|
||||
int b = platformProjectDao.getSourceForgePrjByUrl(sourceforgeTableName,tmp.getUrl()).getDownload_num();
|
||||
if(a > b){
|
||||
tmp = model1;
|
||||
}
|
||||
}
|
||||
catch(Exception e){
|
||||
System.out.println(model1.getSource()+" ==== "+model1.getUrl());
|
||||
System.out.println(tmp.getSource()+" ==== "+tmp.getUrl());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
returnModel = tmp;
|
||||
}
|
||||
|
@ -261,6 +380,19 @@ public class MergeProjectNew2 {
|
|||
double similarity;
|
||||
int similarTagNum;
|
||||
boolean isEqual=false;
|
||||
if(model1.getSource().equals(model2.getSource()))
|
||||
return false;
|
||||
//sourceforge描述信息为空时默认为No description...汇总应该直接处理为空
|
||||
if(model1.getSource().equals("sourceforge")){
|
||||
if(model1.getDescription()!=null && !model1.getDescription().equals(""))
|
||||
if(model1.getDescription().equals("No description "+model1.getName()+" Web Site"))
|
||||
model1.setDescription(null);
|
||||
}
|
||||
if(model2.getSource().equals("sourceforge")){
|
||||
if(model2.getDescription()!=null && !model2.getDescription().equals(""))
|
||||
if(model2.getDescription().equals("No description "+model2.getName()+" Web Site"))
|
||||
model2.setDescription(null);
|
||||
}
|
||||
//oschina的tag和description都为中文
|
||||
if(model1.getSource().equals("oschina")||model2.getSource().equals("oschina")){
|
||||
if(sameName){
|
||||
|
@ -294,8 +426,8 @@ public class MergeProjectNew2 {
|
|||
return true;
|
||||
else{
|
||||
similarity = calSimilarity(model1, model2);
|
||||
//System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity);
|
||||
if(similarity>=0.4)
|
||||
//System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity);
|
||||
if(similarity>=0.5)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
|
@ -305,6 +437,8 @@ public class MergeProjectNew2 {
|
|||
}
|
||||
|
||||
public double calSimilarity(GatherProjectsModel model1, GatherProjectsModel model2){
|
||||
if(model1.getDescription()==null || model1.getDescription().equals("")||model2.getDescription()==null ||model2.getDescription().equals(""))
|
||||
return 0;
|
||||
NGram ngram = new NGram(4);
|
||||
return (1-ngram.distance(model1.getDescription(),model2.getDescription()));
|
||||
}
|
||||
|
@ -423,4 +557,30 @@ public class MergeProjectNew2 {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int includeSubStringNum(String str,String subStr){
|
||||
int count = 0;
|
||||
if (str.indexOf(subStr)==-1)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
else if(str.indexOf(subStr) != -1)
|
||||
{
|
||||
count++;
|
||||
count += includeSubStringNum(str.substring(str.indexOf(subStr)+subStr.length()),subStr);
|
||||
return count;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public static void main(String [] args){
|
||||
ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
|
||||
MergeProjectNew2 Main = applicationContext.getBean(MergeProjectNew2.class);
|
||||
Main.test();
|
||||
}
|
||||
|
||||
public void test(){
|
||||
GatherProjectsModel model = gatherDao.selectGPMById(gatherProjectsTableName, 346);
|
||||
handleNewProject(model);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -160,7 +160,14 @@ public class TransferProjectsUtil2 {
|
|||
//logger.info("处理edd_relations表数据,id:" + relation.getId());
|
||||
List<GatherProjectsModel> gatherProjectList = new ArrayList<GatherProjectsModel>();//最热门项目要放在第一个下标的位置
|
||||
String gather_projects_ids = relation.getGather_projects_ids();
|
||||
if(gather_projects_ids.equals(","))
|
||||
return;
|
||||
try {
|
||||
gather_projects_ids = gather_projects_ids.substring(1, gather_projects_ids.length() - 1);
|
||||
} catch (Exception e1) {
|
||||
System.out.println(relation.getGather_projects_ids());
|
||||
e1.printStackTrace();
|
||||
}
|
||||
String[] idsArray = gather_projects_ids.split(",");
|
||||
boolean error_lost = false;
|
||||
for(int i = 0; i < idsArray.length; i++){
|
||||
|
|
|
@ -62,8 +62,8 @@ public class URLAvailability {
|
|||
|
||||
public static void main(String[] args) {
|
||||
//URLAvailability u = new URLAvailability();
|
||||
System.out.println(isConnect2("http://www.oschina.nethttps://market.aliyun.com/software?spm=5176.1846502.0.0.CpQnwc"));
|
||||
//System.out.println(isRightUrlByRegex("https://www.openhub.net/p/tesseract"));
|
||||
//System.out.println(isConnect2("http://www.oschina.nethttps://market.aliyun.com/software?spm=5176.1846502.0.0.CpQnwc"));
|
||||
System.out.println(isRightUrlByRegex("https://www.openhub.net/p/tesseracthttp://abc.com"));
|
||||
//System.out.println(isConnect("http://www.freecode.com/projects/pdfsam"));
|
||||
}
|
||||
|
||||
|
|
|
@ -19,9 +19,9 @@
|
|||
destroy-method="close">
|
||||
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
|
||||
<property name="url"
|
||||
value="jdbc:mysql://localhost/ossean?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
value="jdbc:mysql://localhost/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
<property name="username" value="root" />
|
||||
<property name="password" value="123456" />
|
||||
<property name="password" value="1234" />
|
||||
<property name="validationQuery" value="SELECT 1" />
|
||||
<property name="testOnBorrow" value="true"/>
|
||||
</bean>
|
||||
|
@ -38,9 +38,9 @@
|
|||
destroy-method="close">
|
||||
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
|
||||
<property name="url"
|
||||
value="jdbc:mysql://localhost:3306/ossean?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
<property name="username" value="root" />
|
||||
<property name="password" value="123456" />
|
||||
<property name="password" value="1234" />
|
||||
<property name="validationQuery" value="SELECT 1" />
|
||||
<property name="testOnBorrow" value="true"/>
|
||||
</bean>
|
||||
|
|
Loading…
Reference in New Issue