project_match with github
This commit is contained in:
parent
8dbf6f62ac
commit
bd2d35aba4
|
@ -11,12 +11,12 @@
|
|||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.eclipse.m2e.core.maven2Builder</name>
|
||||
<name>org.springframework.ide.eclipse.core.springbuilder</name>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.springframework.ide.eclipse.core.springbuilder</name>
|
||||
<name>org.eclipse.m2e.core.maven2Builder</name>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
<property name="url"
|
||||
value="jdbc:mysql://localhost/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
<property name="username" value="root" />
|
||||
<property name="password" value="NUDTpdl@123" />
|
||||
<property name="password" value="NUDTpdl@" />
|
||||
<property name="validationQuery" value="SELECT 1" />
|
||||
<property name="testOnBorrow" value="true"/>
|
||||
</bean>
|
||||
|
|
|
@ -48,7 +48,7 @@ public class GetSynonyms{
|
|||
@Resource
|
||||
private PlatformProjectDao paltformDao;
|
||||
|
||||
private static String gatherProjectsTableName ="gather_projects_tmp";//TableName.gatherProjectsTableName;
|
||||
private static String gatherProjectsTableName =TableName.gatherProjectsTableName;//TableName.gatherProjectsTableName;
|
||||
private static String synonymsTableName = TableName.synonymsTableName;
|
||||
private static String pointerTableName = TableName.pointerTableName;
|
||||
private static int batchSize = 1000;
|
||||
|
@ -558,7 +558,7 @@ public class GetSynonyms{
|
|||
if(prjSource.equals("github")) similarDegree = SimilarDegree(extract, Name);
|
||||
if(similarDegree >= 0.6 && similarDegree <1.0)
|
||||
{
|
||||
logger.info("from extract after similar: "+extract);
|
||||
//logger.info("from extract after similar: "+extract);
|
||||
synonymsList_notsure.add(extract);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -41,7 +41,6 @@ public class MergeProjects {
|
|||
private static String sourceTableName = TableName.gatherProjectsTableName;//去重项目的来源,来自汇总程序生成的表
|
||||
private static String targetTableName = TableName.eddRelationTableName;//去重结果存储位置
|
||||
private static int batchSize = 500;
|
||||
//public static Set<Integer> haveRemovedSet = new HashSet<Integer>();
|
||||
//读指针
|
||||
public int readPointer(String table, String source, String target, int minId){
|
||||
int pointer = minId;
|
||||
|
@ -60,7 +59,6 @@ public class MergeProjects {
|
|||
int tmpCount = 0;
|
||||
count = readPointer(pointerTableName,sourceTableName,targetTableName, count);//指针表计量处理的项目数
|
||||
long start = System.currentTimeMillis();
|
||||
//extractedTime = gatherDao.selectMaxExtractedTime(sourceTableName);
|
||||
String timeStr = gatherDao.selectMaxExtractedTime(sourceTableName);
|
||||
DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
try {
|
||||
|
@ -84,7 +82,7 @@ public class MergeProjects {
|
|||
else{
|
||||
for(GatherProjectsModel model:gpmList){
|
||||
int handleCount = 0;
|
||||
//logger.info("Duplicate remove:handling project " + model.getId());
|
||||
logger.info("Duplicate remove:handling project " + model.getId());
|
||||
String modelTimeStr = model.getExtracted_time();
|
||||
Date modelExtractedTime = null;
|
||||
try {
|
||||
|
@ -113,9 +111,6 @@ public class MergeProjects {
|
|||
public static void main(String[] args){
|
||||
ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
|
||||
MergeProjects Main = applicationContext.getBean(MergeProjects.class);
|
||||
// GetSynonyms getSynonyms = (GetSynonyms) applicationContext.getBean("getSynonyms");
|
||||
// getSynonyms.start();
|
||||
// System.out.println("Extract Synonyms thread start!");
|
||||
Main.start();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,8 +12,9 @@ public class TableName {
|
|||
public static String sourceforgeTableName = "sourceforge_project";
|
||||
public static String openhubTableName = "openhub_project";
|
||||
public static String oschinaTableName = "oschina_project";
|
||||
public static String githubTableName = "github";
|
||||
public static String connection = "jdbc:mysql://localhost:3306/ossean_production";
|
||||
public static String user = "root";
|
||||
public static String psw = "NUDTpdl@123";
|
||||
public static String psw = "NUDTpdl@";
|
||||
|
||||
}
|
||||
|
|
|
@ -42,7 +42,7 @@ public class TransferProjects {
|
|||
private String taggingTableName = TableName.taggingsTableName;
|
||||
|
||||
private int batchSize = 100;
|
||||
private static Set<Integer> haveTransferSet;
|
||||
//private static Set<Integer> haveTransferSet;
|
||||
|
||||
//读指针
|
||||
public int readPointer(String table, String source, String target){
|
||||
|
@ -68,9 +68,10 @@ public class TransferProjects {
|
|||
|
||||
public void start(){
|
||||
|
||||
haveTransferSet = new HashSet<Integer>();
|
||||
haveTransferSet = dbDest.selectOpenSourceProjectsIds(targetTableName);
|
||||
//haveTransferSet = new HashSet<Integer>();
|
||||
//haveTransferSet = dbDest.selectOpenSourceProjectsIds(targetTableName);
|
||||
int delCount = 0;
|
||||
logger.info("begin transfer projects");
|
||||
while(true){
|
||||
int startId = readPointer(pointerTableName, sourceTableName, targetTableName);
|
||||
List<EddRelations> eddRelationList = transferPrjDao.getEddRelationList(sourceTableName, startId, batchSize);
|
||||
|
@ -78,7 +79,7 @@ public class TransferProjects {
|
|||
try {
|
||||
System.out.println("freecode : "+delCount);
|
||||
logger.info("没有数据了,休眠10min");
|
||||
Thread.sleep(600*1000L);
|
||||
Thread.sleep(3*1000L);
|
||||
continue;
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
|
@ -96,15 +97,13 @@ public class TransferProjects {
|
|||
String[] idsArray = gather_projects_ids.split(",");
|
||||
for(int i = 0; i < idsArray.length; i++){
|
||||
int id = Integer.parseInt(idsArray[i]);
|
||||
if(haveTransferSet.contains(id)){
|
||||
if(null != dbDest.selectOpenSourceProjectsItem(targetTableName,id)){
|
||||
isUpdate = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
int prjId = transferProjectsUtil2.handleOneRelation(relation,isUpdate);
|
||||
if(prjId!=-1)
|
||||
haveTransferSet.add(prjId);
|
||||
else
|
||||
if(prjId==-1)//统计freecode
|
||||
delCount++;
|
||||
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, relation.getId()+1);
|
||||
}
|
||||
|
|
|
@ -41,11 +41,11 @@ public interface DBDest {
|
|||
@Insert("insert into ${table} (id,name,description,"
|
||||
+ "url,url_md5,language,category,homepage,license,"
|
||||
+ "source,created_time,updated_time,extracted_time,"
|
||||
+ "tags,tags_for_search,synonyms,update_mark) values (#{model.id},#{model.name},#{model.description},"
|
||||
+ "tags,tags_for_search,synonyms,update_mark,filtration) values (#{model.id},#{model.name},#{model.description},"
|
||||
+ "#{model.url},#{model.url_md5},#{model.language},"
|
||||
+ "#{model.category},#{model.homepage},#{model.license},#{model.source},"
|
||||
+ "#{model.created_time},#{model.updated_time},#{model.extracted_time},#{model.tags},"
|
||||
+ "#{model.tags_for_search},#{model.synonyms},#{model.update_mark})")
|
||||
+ "#{model.tags_for_search},#{model.synonyms},#{model.update_mark},#{model.filtration})")
|
||||
public void insertOsp(@Param("table") String table, @Param("model") OpenSourceProject model);
|
||||
|
||||
//查找刚刚插入open_source_projects表中的记录id
|
||||
|
|
|
@ -129,7 +129,7 @@ public interface DBSource {
|
|||
public Synonyms getSynonymById(@Param("table") String table, @Param("id") int id);
|
||||
|
||||
//读取对应于项目ID的synonym对象
|
||||
@Select("select synonyms from ${table} where prjId=#{id} and flag=1;")
|
||||
@Select("select synonyms from ${table} where prjId=#{id} and flag=0;")
|
||||
public List<String> getSynonymByPrjId(@Param("table") String table, @Param("id") int id);
|
||||
|
||||
//transfer中:读取对应于项目ID的synonym对象,只需确定的flag=1别名
|
||||
|
@ -162,8 +162,9 @@ public interface DBSource {
|
|||
public void deleteEddRelationsItem(@Param("table") String table, @Param("id") int id);
|
||||
|
||||
//向edd_relations表插入记录
|
||||
@Insert("insert into ${table} (gather_projects_ids) values (#{gather_projects_ids})")
|
||||
public void insertEddRelations(@Param("table") String table, @Param("gather_projects_ids") String gather_projects_ids);
|
||||
@Insert("insert into ${table} (gather_projects_ids,flag) values (#{gather_projects_ids},#{flag})")
|
||||
public void insertEddRelations(@Param("table") String table, @Param("gather_projects_ids") String gather_projects_ids,
|
||||
@Param("flag") int flag);
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ import com.ossean.model.OpenSourceProject;
|
|||
|
||||
public interface TransferPrjDao {
|
||||
//获取edd_relations表中数据
|
||||
@Select("select * from ${table} where id>=#{start} limit #{size}")
|
||||
@Select("select * from ${table} where id>=#{start} and flag =0 limit #{size}")
|
||||
public List<EddRelations> getEddRelationList(@Param("table") String table, @Param("start") int start, @Param("size") int size);
|
||||
|
||||
//更新edd_relations表的osp_id字段
|
||||
|
|
|
@ -22,8 +22,8 @@ public interface PlatformProjectDao {
|
|||
@Select("select name,star_num,user_num from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public OschinaProject getOschinaPrjByUrl(@Param("table") String table,@Param("url") String url);
|
||||
|
||||
@Select("select id,short_name,name,stargazers_count,forks from github where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public GithubProject getGithubPrjByUrl(@Param("url") String url);
|
||||
@Select("select id,short_name,name,stargazers_count,forks from ${table} where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public GithubProject getGithubPrjByUrl(@Param("table") String table,@Param("url") String url);
|
||||
//获取创建时间以比较项目的可信度
|
||||
@Select("select created_time from ${table} where url_md5=#{url_md5}")
|
||||
public List<String> getTime(@Param("table") String table, @Param("created_time") String created_time,@Param("url_md5") String url_md5);
|
||||
|
|
|
@ -59,8 +59,7 @@ public class MergeProjectNew2 {
|
|||
private String sourceforgeTableName = TableName.sourceforgeTableName;
|
||||
private String openhubTableName = TableName.openhubTableName;
|
||||
private String oschinaTableName = TableName.oschinaTableName;
|
||||
// private static GatherProjectsModel gitModel =
|
||||
// null;//gitModel表示重复关系的github项目中最有影响力的Model;先不加入关系中
|
||||
private static GatherProjectsModel gitModel = null;// gitModel表示重复关系的github项目中最有影响力的Model;先不加入关系中
|
||||
|
||||
private static Set<Integer> removeSet;
|
||||
private static Set<Integer> relationSet;
|
||||
|
@ -77,6 +76,10 @@ public class MergeProjectNew2 {
|
|||
|
||||
public int handleNewProject(GatherProjectsModel model, boolean isIncrese) {
|
||||
|
||||
if (gatherDao.selectGPMById(gatherProjectsTableName, model.getId())
|
||||
.getUpdate_mark() != 2 && model.getSource().equals("github") )
|
||||
gitModel = model;
|
||||
|
||||
if (isIncrese) {
|
||||
logger.info("model " + model.getId() + " is increase data");
|
||||
}
|
||||
|
@ -87,9 +90,10 @@ public class MergeProjectNew2 {
|
|||
relationSet = new HashSet<Integer>();// 有重复关系的项目集合
|
||||
removeSet = new HashSet<Integer>();// 已经处理的项目,包括单独插入的和有重复关系的
|
||||
relationSet = findDupProject(model, relationSet, isIncrese);
|
||||
// if(gitModel!=null)
|
||||
// relationSet.add(gitModel.getId());
|
||||
if (gitModel != null)
|
||||
relationSet.add(gitModel.getId());
|
||||
OperateTable(relationSet, isIncrese);// 表操作
|
||||
gitModel = null;
|
||||
return removeSet.size();
|
||||
}
|
||||
|
||||
|
@ -107,14 +111,6 @@ public class MergeProjectNew2 {
|
|||
if (prjHomepage == null || prjHomepage.equals(""))
|
||||
if (model.getSource().equals("apache"))
|
||||
prjHomepage = model.getUrl();
|
||||
// 暂时性对openhub项目进行处理
|
||||
// if(model.getSource().equals("openhub")){
|
||||
// if(model.getHomepage()!=null && !model.getHomepage().equals(""))
|
||||
// if(!isRightUrlByRegex(model.getHomepage())||includeSubStringNum(model.getHomepage(),
|
||||
// "http")>1){
|
||||
// prjHomepage=null;
|
||||
// }
|
||||
// }
|
||||
|
||||
// 项目重复匹配列表
|
||||
List<GatherProjectsModel> matchedList = new ArrayList<GatherProjectsModel>();
|
||||
|
@ -150,15 +146,13 @@ public class MergeProjectNew2 {
|
|||
}
|
||||
}
|
||||
for (GatherProjectsModel m : sameHomepageList) {
|
||||
// logger.info("project " + model.getId() + " and " +
|
||||
// m.getId() + " match by homepage");
|
||||
// logger.info("project " + model.getId() + " and " + m.getId()
|
||||
// + " match by homepage");
|
||||
if (!m.getSource().equals(model.getSource())) {
|
||||
if (sameHomeMap.get(m.getSource()) == null) {
|
||||
if (sameHomeMap.get(m.getSource()) == null)
|
||||
sameHomeMap.put(m.getSource(),
|
||||
new ArrayList<GatherProjectsModel>());
|
||||
sameHomeMap.get(m.getSource()).add(m);
|
||||
} else
|
||||
sameHomeMap.get(m.getSource()).add(m);
|
||||
sameHomeMap.get(m.getSource()).add(m);
|
||||
} else {
|
||||
sameHomeAndSourceList.add(m);
|
||||
}
|
||||
|
@ -172,8 +166,12 @@ public class MergeProjectNew2 {
|
|||
&& tmp.getUpdate_mark() != 2) {
|
||||
// 插入关系表
|
||||
String relationStr = "," + tmp.getId() + ",";
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
relationStr);
|
||||
if(tmp.getSource().equals("github"))
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
relationStr,0);
|
||||
else
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
relationStr,1);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2,
|
||||
tmp.getId());
|
||||
// handleCount++;
|
||||
|
@ -184,38 +182,41 @@ public class MergeProjectNew2 {
|
|||
String relationStr = "," + model.getId() + ",";
|
||||
relationSet.add(model.getId());
|
||||
return relationSet;
|
||||
} else {
|
||||
// if(uniqueModel.getSource().equals("github")){
|
||||
// if(gitModel!=null)
|
||||
// if(compareGithubProject(uniqueModel,gitModel))
|
||||
// gitModel = uniqueModel;
|
||||
// }
|
||||
}
|
||||
// else {
|
||||
// // if(uniqueModel.getSource().equals("github")){
|
||||
// // if(gitModel!=null)
|
||||
// // if(compareGithubProject(uniqueModel,gitModel))
|
||||
// // gitModel = uniqueModel;
|
||||
// // }
|
||||
// }
|
||||
|
||||
}
|
||||
// 处理查找的项目有同一source的情况,确保一个source最多只有一个项目有相同homepage
|
||||
for (String uniqueSource : sameHomeMap.keySet()) {
|
||||
GatherProjectsModel uniqueModel = getUniqueSourceModel(sameHomeMap
|
||||
.get(uniqueSource));
|
||||
// if(uniqueModel.getSource().equals("github")){
|
||||
// if(gitModel!=null)
|
||||
// if(compareGithubProject(uniqueModel,gitModel))
|
||||
// gitModel = uniqueModel;
|
||||
// }
|
||||
if (uniqueModel.getSource().equals("github")) {
|
||||
if (gitModel == null
|
||||
|| compareGithubProject(uniqueModel, gitModel))
|
||||
gitModel = uniqueModel;
|
||||
}
|
||||
if (!relationSet.contains(uniqueModel.getId())
|
||||
&& !uniqueModel.getSource().equals("github")) {
|
||||
matchedList.add(uniqueModel);
|
||||
matchedIdList.add(uniqueModel.getId());
|
||||
}
|
||||
for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)) {
|
||||
if (tmp.getId() != uniqueModel.getId() && tmp.getUpdate_mark() != 2) {
|
||||
if (tmp.getId() != uniqueModel.getId()
|
||||
&& tmp.getUpdate_mark() != 2) {
|
||||
// 插入关系表
|
||||
String relationStr = "," + tmp.getId() + ",";
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
relationStr);
|
||||
if(tmp.getSource().equals("github"))
|
||||
dbSource.insertEddRelations(eddRelationTableName,relationStr,0);
|
||||
else
|
||||
dbSource.insertEddRelations(eddRelationTableName,relationStr,1);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2,
|
||||
tmp.getId());
|
||||
// handleCount++;
|
||||
removeSet.add(tmp.getId());
|
||||
}
|
||||
}
|
||||
|
@ -236,8 +237,8 @@ public class MergeProjectNew2 {
|
|||
}
|
||||
for (int i = 0; i < sameNamePrjList.size(); i++) {// 不重复比较
|
||||
int tmpId = sameNamePrjList.get(i).getId();
|
||||
// logger.info("project " + model.getId() + " and " +
|
||||
// sameNamePrjList.get(i).getId() + " match by name");
|
||||
// logger.info("project " + model.getId() + " and "
|
||||
// + sameNamePrjList.get(i).getId() + " match by name");
|
||||
if (!relationSet.contains(tmpId) && !matchedIdList.contains(tmpId)) {// 未处理的
|
||||
afterNamePrjList.add(sameNamePrjList.get(i));
|
||||
}
|
||||
|
@ -254,8 +255,8 @@ public class MergeProjectNew2 {
|
|||
List<GatherProjectsModel> afterSynonymsPrjList = new ArrayList<GatherProjectsModel>();
|
||||
for (int j = 0; j < synonymsPrjList.size(); j++) {// 不重复比较
|
||||
int tmpId = synonymsPrjList.get(j).getId();
|
||||
// logger.info("project " + model.getId() + " and " +
|
||||
// synonymsPrjList.get(j).getId() + " match by synonyms");
|
||||
// logger.info("project " + model.getId() + " and "
|
||||
// + synonymsPrjList.get(j).getId() + " match by synonyms");
|
||||
if (!relationSet.contains(tmpId)
|
||||
&& !afterNamePrjIdList.contains(tmpId)
|
||||
&& !matchedIdList.contains(tmpId))// 未处理的,也不同名,否则会重复计算
|
||||
|
@ -266,10 +267,10 @@ public class MergeProjectNew2 {
|
|||
|
||||
afterSynonymsPrjList = mergeTwoList4Source(afterNamePrjList,
|
||||
afterSynonymsPrjList);
|
||||
// for(GatherProjectsModel m : afterSynonymsPrjList){
|
||||
// logger.info("project " + model.getId() + " and " + m.getId() +
|
||||
// " match by synonyms/name and verify");
|
||||
// }
|
||||
// for (GatherProjectsModel m : afterSynonymsPrjList) {
|
||||
// logger.info("project " + model.getId() + " and " + m.getId()
|
||||
// + " match by synonyms/name and verify");
|
||||
// }
|
||||
matchedList = mergeTwoList4Source(matchedList, afterSynonymsPrjList);// 同一社区的项目相同homepage比名字相同和别名相同优先级更高
|
||||
relationSet.add(model.getId());
|
||||
if (matchedList.size() == 0)
|
||||
|
@ -307,26 +308,24 @@ public class MergeProjectNew2 {
|
|||
mergeList.add(gatherDao.selectGPMById(gatherProjectsTableName, id));
|
||||
}
|
||||
for (GatherProjectsModel model1 : mergeList) {
|
||||
// handleCount++;
|
||||
if (map.get(model1.getSource()) == null) {
|
||||
if (map.get(model1.getSource()) == null)
|
||||
map.put(model1.getSource(),
|
||||
new ArrayList<GatherProjectsModel>());
|
||||
map.get(model1.getSource()).add(model1);
|
||||
} else
|
||||
map.get(model1.getSource()).add(model1);
|
||||
map.get(model1.getSource()).add(model1);
|
||||
}
|
||||
for (String uniqueSource : map.keySet()) {
|
||||
GatherProjectsModel uniqueModel = getUniqueSourceModel(map
|
||||
.get(uniqueSource));
|
||||
for (GatherProjectsModel tmp : map.get(uniqueSource)) {
|
||||
if (tmp.getId()!= uniqueModel.getId()) {
|
||||
if (tmp.getId() != uniqueModel.getId()) {
|
||||
// 插入关系表
|
||||
String tmpStr = "," + tmp.getId() + ",";
|
||||
if (tmp.getUpdate_mark() != 2) {
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
tmpStr);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2,
|
||||
tmp.getId());
|
||||
if(tmp.getSource().equals("github"))
|
||||
dbSource.insertEddRelations(eddRelationTableName,tmpStr,0);
|
||||
else
|
||||
dbSource.insertEddRelations(eddRelationTableName,tmpStr,1);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2,tmp.getId());
|
||||
removeSet.add(tmp.getId());
|
||||
}
|
||||
}
|
||||
|
@ -337,28 +336,28 @@ public class MergeProjectNew2 {
|
|||
relationStr.append(",");
|
||||
}
|
||||
// 增量数据的处理
|
||||
if (isIncrease) {
|
||||
|
||||
for (int id : finalIdList) {// 从关系表取出重复关系
|
||||
EddRelations tmpRelation = dbSource
|
||||
.getEddRelationsByGatherProjectsId(
|
||||
eddRelationTableName, id);
|
||||
if (tmpRelation != null)
|
||||
eddRelationsList.add(tmpRelation);
|
||||
}
|
||||
|
||||
if (eddRelationsList != null && eddRelationsList.size() != 0) {
|
||||
for (EddRelations relation : eddRelationsList) {
|
||||
String tmp = StringHandler.removeFirstComma(relation
|
||||
.getGather_projects_ids());
|
||||
relationStr.append(tmp);
|
||||
}
|
||||
}
|
||||
finallyRelationStr = StringHandler.removeDupIdInStr(relationStr
|
||||
.toString());
|
||||
logger.info(finallyRelationStr);
|
||||
//if (isIncrease) {
|
||||
|
||||
for (int id : finalIdList) {// 从关系表取出重复关系
|
||||
EddRelations tmpRelation = dbSource
|
||||
.getEddRelationsByGatherProjectsId(
|
||||
eddRelationTableName, id);
|
||||
if (tmpRelation != null)
|
||||
eddRelationsList.add(tmpRelation);
|
||||
}
|
||||
|
||||
if (eddRelationsList != null && eddRelationsList.size() != 0) {
|
||||
for (EddRelations relation : eddRelationsList) {
|
||||
String tmp = StringHandler.removeFirstComma(relation
|
||||
.getGather_projects_ids());
|
||||
relationStr.append(tmp);
|
||||
}
|
||||
}
|
||||
finallyRelationStr = StringHandler.removeDupIdInStr(relationStr
|
||||
.toString());
|
||||
//logger.info(finallyRelationStr);
|
||||
|
||||
//}
|
||||
try {
|
||||
if (!finallyRelationStr.equals(",")) {
|
||||
for (int id : finalIdList) {
|
||||
|
@ -370,13 +369,13 @@ public class MergeProjectNew2 {
|
|||
}
|
||||
}
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
finallyRelationStr);
|
||||
if (isIncrease)// 增量数据的关系,删除以前的,插入最新的关系
|
||||
for (EddRelations relation : eddRelationsList) {
|
||||
logger.info("delete before relations");
|
||||
dbSource.deleteEddRelationsItem(eddRelationTableName,
|
||||
relation.getId());
|
||||
}
|
||||
finallyRelationStr,0);
|
||||
//if (isIncrease)// 增量数据的关系,删除以前的,插入最新的关系
|
||||
for (EddRelations relation : eddRelationsList) {
|
||||
//logger.info("delete before relations");
|
||||
dbSource.deleteEddRelationsItem(eddRelationTableName,
|
||||
relation.getId());
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
|
@ -428,33 +427,32 @@ public class MergeProjectNew2 {
|
|||
Map<String, List<GatherProjectsModel>> map = new HashMap<String, List<GatherProjectsModel>>();
|
||||
GatherProjectsModel uniqueModel = new GatherProjectsModel();
|
||||
for (GatherProjectsModel model1 : returnList) {
|
||||
if (map.get(model1.getSource()) == null) {
|
||||
if (map.get(model1.getSource()) == null)
|
||||
map.put(model1.getSource(),
|
||||
new ArrayList<GatherProjectsModel>());
|
||||
map.get(model1.getSource()).add(model1);
|
||||
} else
|
||||
map.get(model1.getSource()).add(model1);
|
||||
map.get(model1.getSource()).add(model1);
|
||||
}
|
||||
for (String uniqueSource : map.keySet()) {
|
||||
uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
|
||||
|
||||
for (GatherProjectsModel tmp : map.get(uniqueSource)) {
|
||||
if (tmp.getId() != uniqueModel.getId() && tmp.getUpdate_mark() != 2) {
|
||||
if (tmp.getId() != uniqueModel.getId()
|
||||
&& tmp.getUpdate_mark() != 2) {
|
||||
// 插入关系表
|
||||
String relationStr = "," + tmp.getId() + ",";
|
||||
dbSource.insertEddRelations(eddRelationTableName,
|
||||
relationStr);
|
||||
if(tmp.getSource().equals("github"))
|
||||
dbSource.insertEddRelations(eddRelationTableName,relationStr,0);
|
||||
else
|
||||
dbSource.insertEddRelations(eddRelationTableName,relationStr,1);
|
||||
gatherDao.updateMark(gatherProjectsTableName, 2,
|
||||
tmp.getId());
|
||||
// handleCount++;
|
||||
removeSet.add(tmp.getId());
|
||||
}
|
||||
}
|
||||
if (!uniqueModel.getSource().equals("github"))
|
||||
afterReturnList.add(uniqueModel);
|
||||
// else
|
||||
// if(compareGithubProject(uniqueModel, gitModel))
|
||||
// gitModel = uniqueModel;
|
||||
else if (compareGithubProject(uniqueModel, gitModel))
|
||||
gitModel = uniqueModel;
|
||||
|
||||
}
|
||||
return afterReturnList;
|
||||
|
@ -463,9 +461,9 @@ public class MergeProjectNew2 {
|
|||
|
||||
public boolean compareGithubProject(GatherProjectsModel model1,
|
||||
GatherProjectsModel model2) {
|
||||
|
||||
if (platformProjectDao.getGithubPrjByUrl(model1.getUrl())
|
||||
.getStargazers_count() > platformProjectDao.getGithubPrjByUrl(
|
||||
|
||||
if (platformProjectDao.getGithubPrjByUrl(TableName.githubTableName,model1.getUrl())
|
||||
.getStargazers_count() > platformProjectDao.getGithubPrjByUrl(TableName.githubTableName,
|
||||
model2.getUrl()).getStargazers_count())
|
||||
return true;
|
||||
else
|
||||
|
@ -592,7 +590,7 @@ public class MergeProjectNew2 {
|
|||
else {
|
||||
similarity = calSimilarity(model1, model2);
|
||||
// System.out.println("same name to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity);
|
||||
if (similarity > 0.2)
|
||||
if (similarity > 0.1)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
|
@ -603,7 +601,7 @@ public class MergeProjectNew2 {
|
|||
else {
|
||||
similarity = calSimilarity(model1, model2);
|
||||
// System.out.println("same synonyms to calSimilarityByDescrption between "+model1.getId()+" and "+model2.getId()+" : "+similarity);
|
||||
if (similarity >= 0.5)
|
||||
if (similarity >= 0.3)
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
|
|
|
@ -200,9 +200,9 @@ public class TransferProjectsUtil2 {
|
|||
OpenSourceProject osp = mergeGatherProjectsToOpenSourceProject(gatherProjectList);
|
||||
if(osp != null){
|
||||
if(isUpdate){
|
||||
//logger.info("!!!!!!!!"+gather_projects_ids);
|
||||
//logger.info(osp.getId());
|
||||
logger.info("!!!!!!!!"+gather_projects_ids);
|
||||
OpenSourceProject formerOsp = getFormerOsp(gather_projects_ids);
|
||||
logger.info(osp.getId()+formerOsp.getId());
|
||||
if(formerOsp.getFiltration()==2){
|
||||
osp.setFiltration(2);
|
||||
osp.setUpdate_mark(1);
|
||||
|
@ -280,10 +280,12 @@ public class TransferProjectsUtil2 {
|
|||
tmp_value = 0;
|
||||
else if(source.toLowerCase().equals("oschina"))
|
||||
tmp_value = 1;
|
||||
else if(source.toLowerCase().equals("openhub"))
|
||||
else if(source.toLowerCase().equals("github"))
|
||||
tmp_value = 2;
|
||||
else if(source.toLowerCase().equals("openhub"))
|
||||
tmp_value = 4;
|
||||
else if(source.toLowerCase().equals("sourceforge"))
|
||||
tmp_value = 3;
|
||||
tmp_value = 5;
|
||||
else
|
||||
tmp_value = Integer.MAX_VALUE;
|
||||
if(head > tmp_value){
|
||||
|
|
|
@ -38,7 +38,7 @@
|
|||
destroy-method="close">
|
||||
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
|
||||
<property name="url"
|
||||
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
value="jdbc:mysql://localhost:3306/ossean?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" />
|
||||
<property name="username" value="root" />
|
||||
<property name="password" value="123456" />
|
||||
<property name="validationQuery" value="SELECT 1" />
|
||||
|
|
Loading…
Reference in New Issue