diff --git a/.gitignore b/.gitignore index 2d61090..a2ee364 100644 --- a/.gitignore +++ b/.gitignore @@ -110,8 +110,10 @@ /gather_program/.settings/ /gather_program/.classpath -#/crawler/dailyScheduledCrawler/fetch_networks/target/ -#/crawler/dailyScheduledCrawler/fetch_networks/log/* -#/crawler/daily_scheduler/log/* -#/crawler/moreSmarterCrawler/fetch_networks/target/ -#/crawler/moreSmarterCrawler/fetch_networks/log/* + +/crawler/dailyScheduledCrawler/fetch_networks/target/ +/crawler/dailyScheduledCrawler/fetch_networks/log/* +/crawler/daily_scheduler/log/* +/crawler/daily_scheduler/log_mem/* +/crawler/moreSmarterCrawler/fetch_networks/target/ +/crawler/moreSmarterCrawler/fetch_networks/log/* diff --git a/gather_program/bin/gather_projects.sh b/gather_program/bin/gather_projects.sh index e8be36a..79ceee0 100644 --- a/gather_program/bin/gather_projects.sh +++ b/gather_program/bin/gather_projects.sh @@ -16,4 +16,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess gather_projects >>log/gather_projects.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess gather_projects > log/gather_projects.log 2>&1 & \ No newline at end of file diff --git a/gather_program/bin/job_requirements.sh b/gather_program/bin/job_requirements.sh index 8abfc74..d81149e 100644 --- a/gather_program/bin/job_requirements.sh +++ b/gather_program/bin/job_requirements.sh @@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess job_requirements >>log/job_requirements.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess job_requirements > log/job_requirements.log 2>&1 & \ No newline at end of file diff --git a/gather_program/bin/relative_memos.sh b/gather_program/bin/relative_memos.sh index 45d96e6..fbb4326 100644 --- a/gather_program/bin/relative_memos.sh +++ b/gather_program/bin/relative_memos.sh @@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx2048m -Xms1024m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:Max #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess relative_memos >>log/relative_memos.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess relative_memos > log/relative_memos.log 2>&1 & \ No newline at end of file diff --git a/gather_program/bin/resources/applicationContext-myBatis.xml b/gather_program/bin/resources/applicationContext-myBatis.xml index 69361cf..96b11e1 100644 --- a/gather_program/bin/resources/applicationContext-myBatis.xml +++ b/gather_program/bin/resources/applicationContext-myBatis.xml @@ -39,9 +39,9 @@ destroy-method="close"> + value="jdbc:mysql://172.16.128.36:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" /> - + diff --git a/gather_program/bin/resources/relative_memos.xml b/gather_program/bin/resources/relative_memos.xml index c0aa136..ea2f60b 100644 --- a/gather_program/bin/resources/relative_memos.xml +++ b/gather_program/bin/resources/relative_memos.xml @@ -3,7 +3,7 @@ TableFlow pointers - oschina_question,slashdot,iteye_blog,51cto_blog,csdn_question,cnblog_news,cnblog_question,csdn_bbs,csdn_blogs,dewen_question,stackoverflow + oschina_question,iteye_blog,51cto_blog,csdn_question,cnblog_news,cnblog_question,csdn_bbs,csdn_blogs,dewen_question,stackoverflow relative_memos id,title,content,created_time,now(),type,tags,source,url,url_md5,author,author_url,view_num,review_num,extracted_time id,title,content,created_time,updated_time,memo_type,tags,source,url,url_md5,author,author_url,view_num,review_num,extracted_time diff --git a/gather_program/sql/settings.sql b/gather_program/sql/settings.sql new file mode 100644 index 0000000..a77a396 --- /dev/null +++ b/gather_program/sql/settings.sql @@ -0,0 +1,29 @@ +/* +Navicat MySQL Data Transfer + +Source Server : ossean +Source Server Version : 50535 +Source Host : 127.0.0.1:3306 +Source Database : ossean_new + +Target Server Type : MYSQL +Target Server Version : 50535 +File Encoding : 65001 + +Date: 2016-11-15 20:04:15 +*/ + +SET FOREIGN_KEY_CHECKS=0; + +-- ---------------------------- +-- Table structure for settings +-- ---------------------------- +DROP TABLE IF EXISTS `settings`; +CREATE TABLE `settings` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `name` varchar(255) NOT NULL DEFAULT '', + `value` text, + `updated_on` datetime DEFAULT NULL, + PRIMARY KEY (`id`), + KEY `index_settings_on_name` (`name`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8; diff --git a/gather_program/sql/taggings.sql b/gather_program/sql/taggings.sql new file mode 100644 index 0000000..9087bdb --- /dev/null +++ b/gather_program/sql/taggings.sql @@ -0,0 +1,36 @@ +/* +Navicat MySQL Data Transfer + +Source Server : ossean +Source Server Version : 50535 +Source Host : 127.0.0.1:3306 +Source Database : ossean_production + +Target Server Type : MYSQL +Target Server Version : 50535 +File Encoding : 65001 + +Date: 2016-11-13 22:21:05 +*/ + +SET FOREIGN_KEY_CHECKS=0; + +-- ---------------------------- +-- Table structure for taggings +-- ---------------------------- +DROP TABLE IF EXISTS `taggings`; +CREATE TABLE `taggings` ( + `id` int(11) NOT NULL AUTO_INCREMENT, + `tag_id` int(11) NOT NULL, + `taggable_id` int(11) NOT NULL, + `taggable_type` varchar(255) NOT NULL, + `tagger_id` int(11) DEFAULT NULL, + `tagger_type` varchar(255) DEFAULT NULL, + `context` varchar(128) DEFAULT NULL, + `created_at` datetime DEFAULT NULL, + `created_time` datetime DEFAULT NULL, + `disagree_num` int(11) DEFAULT '0', + `tag_source` varchar(255) DEFAULT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `index_taggings_on_tag_id_and_taggable_id_and_taggable_type` (`tag_id`,`taggable_id`,`taggable_type`) USING BTREE +) ENGINE=InnoDB AUTO_INCREMENT=19315557 DEFAULT CHARSET=utf8; diff --git a/gather_program/src/main/java/org/ossean/gather/process/GatherThread.java b/gather_program/src/main/java/org/ossean/gather/process/GatherThread.java index 7149774..fabb5ef 100644 --- a/gather_program/src/main/java/org/ossean/gather/process/GatherThread.java +++ b/gather_program/src/main/java/org/ossean/gather/process/GatherThread.java @@ -1,363 +1,363 @@ -package org.ossean.gather.process; - -import java.util.List; - -import javax.annotation.Resource; - -import org.apache.log4j.Logger; -import org.ossean.gather.model.Configure; -import org.ossean.gather.model.GatherProject; -import org.ossean.gather.model.JobRequirement; -import org.ossean.gather.model.PKControlPosts; -import org.ossean.gather.model.PKControlProjects; -import org.ossean.gather.model.RelativeMemo; -import org.ossean.gather.model.Taggings; -import org.ossean.gather.sourceDao.GatherDao; -import org.ossean.gather.sourceDao.PKControlPostsDao; -import org.ossean.gather.sourceDao.PKControlProjectsDao; -import org.ossean.gather.targetDao.PointerDao; -import org.ossean.gather.targetDao.TargetDao; -import org.springframework.context.annotation.Scope; -import org.springframework.stereotype.Component; - -@Component("gatherThread") -@Scope("prototype") -public class GatherThread implements Runnable { - private static Logger logger = Logger.getLogger(GatherThread.class); - private Configure conf; - - @Resource - private GatherDao gatherDao; - @Resource - private PointerDao pointerDao; - @Resource - private PKControlPostsDao pkControlPostsDao; - @Resource - private TargetDao targetDao; - @Resource - private PKControlProjectsDao pkControlProjectsDao; - - private int idsBegin; // 转移开始Id值 - private int idsEnd; // 转移结束Id值 - private int idsIncrement;// 每次转移的Id量 - - private int beginId; - private int endId; - - private String sourceTableName; - private String pkControlPostsTableName = "pk_control_posts"; - private String pkControlProjectsTableName = "pk_control_projects"; - private String taggingsTableName = "taggings"; - private String tagsTableName = "tags"; - - private String gatherPostsTableName = "relative_memos"; - private int maxId; - - public void setParameters(Configure conf, String sourceTableName) { - this.conf = conf; - this.sourceTableName = sourceTableName; - } - - // 读指针 - public int readPointer(String table, String source, String target) { - int pointer = 1; - try { - pointer = pointerDao.getPointer(table, source, target); - } catch (Exception e) { - // 表示表中没有数据 - logger.info("No such pointer! Create one"); - pointerDao.insertPointer(table, source, target, 1); - } - return pointer; - } - - @Override - public void run() { -// long start = System.currentTimeMillis(); - Thread.currentThread().setName(sourceTableName); - idsIncrement = conf.getIdsIncrement(); - idsBegin = readPointer(conf.getPointerTableName(), sourceTableName, - conf.getTargetTableName()); - idsEnd = maxId = gatherDao.getMaxId(sourceTableName); - while (idsBegin < idsEnd) { - beginId = idsBegin; - endId = beginId + idsIncrement - 1; // 取数据时两边都取等号 - if (endId <= maxId) { - handleBatchData(beginId, endId, conf); - idsBegin = idsBegin + idsIncrement; - } else { - endId = maxId; // endId应小于maxId - handleBatchData(beginId, endId, conf); - break; - } - } - GatherProcess.gatherState.put(sourceTableName, false); -// long end = System.currentTimeMillis(); -// logger.info((end - start) / 6000); - } - - public void handleBatchData(int beginId, int endId, Configure conf) { - logger.info("BeginId#" + sourceTableName + ":" + beginId); - // 表示任务没有完成 - int maxId = gatherDao.getMaxId(sourceTableName); - // 防止转移超过当前最大值的Id数据 - if (beginId >= 0 && endId > 0 && maxId >= endId) { - // 更新执行开始时间 - logger.info("begin gathering..."); - // 插入Id段数据,忽略重复值 - try { - String[] sourceFields = conf.getSourceFields().split(","); - String[] targetFields = conf.getTargetFields().split(","); - String selectItems = ""; - for (int i = 0; i < sourceFields.length; i++) { - String str_source = sourceFields[i]; - String str_target = targetFields[i]; - selectItems += str_source + " as " + str_target + ","; - } - selectItems = selectItems - .substring(0, selectItems.length() - 1) + " "; - if (conf.getTargetTableName().equals("relative_memos")) { - List dataGet = gatherDao.getPostGatherData( - sourceTableName, selectItems, beginId, endId, - conf.getAndWhere()); - for (int i = 0; i < dataGet.size(); i++) { - RelativeMemo model = dataGet.get(i); - String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子 - // 是否更新 - int postId = 0; - if(GatherProcess.urlMd5Set.contains(urlMD5)){ - //urlmd5存在则更新 - RelativeMemo samePost = targetDao.findPostByUrlMD5( - conf.getTargetTableName(), urlMD5); - // update gather_projects表中对应的记录,在维持待更新表 - postId = samePost.getId(); - model.setId(postId); - handleUpdateGatherPosts(samePost.getId(), model); - }else{ - // 不存在 插入 - PKControlPosts pkControlModel = pkControlPostsDao - .selectItemByUrlMD5( - pkControlPostsTableName, urlMD5);// 查看有没有固定的id - if (pkControlModel != null) - model.setId(pkControlModel.getId()); - else { - // 在pk_control_posts表中生成当前项目对应的id - pkControlPostsDao.insertOneItem( - pkControlPostsTableName, urlMD5); - // 查看刚刚插入信息的id - PKControlPosts controlItem = pkControlPostsDao - .selectItemByUrlMD5( - pkControlPostsTableName, urlMD5); - // 用id构造model对应的固定不变的id - model.setId(controlItem.getId()); - postId = model.getId(); - } - handleInsertGatherPosts(model, conf); - GatherProcess.urlMd5Set.add(urlMD5); - } - - // 将tag和项目的关系存入表item_tag_relation 并分离tag - String tags = model.getTags(); - if (tags == null) { - // 表示该项目没有标签 - continue; - } - List tagList = DataHandler - .tagsSegmentation(tags); - for (String tag : tagList) { - targetDao.insertTag(tagsTableName, tag);// ignore方式插入该项目的标签 - int tag_id = targetDao.selectTagIdByName( - tagsTableName, tag); - Taggings taggings = new Taggings(); - taggings.setTag_id(tag_id); - taggings.setTaggable_id(postId); - taggings.setTaggable_type("RelativeMemo"); - taggings.setContext("tags"); - taggings.setCreated_at(DataHandler.getNow()); - // 将Taggings对象存入数据库中 - try { - targetDao.insertTaggings(taggingsTableName, - taggings); - } catch (Exception e) { - // 在插入记录之前 relative_memos表中的记录已经被删除掉了 - logger.error(e); - System.exit(0); - } - } - } - } else if (conf.getTargetTableName().equals("gather_projects")) { - List dataGet = gatherDao.getPrjGatherData( - sourceTableName, selectItems, beginId, endId, - conf.getAndWhere()); - for (int i = 0; i < dataGet.size(); i++) { - GatherProject model = dataGet.get(i); - String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该项目 - // 是否更新 - int prjId = 0; - if(GatherProcess.urlMd5Set.contains(urlMD5)){ - GatherProject samePrj = targetDao.findPrjByUrlMD5( - conf.getTargetTableName(), urlMD5); - // update gather_projects表中对应的记录,在维持待更新表 - prjId = samePrj.getId(); - model.setId(prjId); - model.setUpdate_mark(1); - handleUpdateGatherProjects(samePrj.getId(), model); - }else{ - // 不存在 插入 - PKControlProjects pkControlProjects = pkControlProjectsDao - .selectItemByUrlMD5( - pkControlProjectsTableName, urlMD5);// 查看有没有固定的id - if (pkControlProjects != null) - model.setId(pkControlProjects.getId()); - else { - // 在pk_control_posts表中生成当前项目对应的id - pkControlProjectsDao.insertOneItem( - pkControlProjectsTableName, urlMD5); - // 查看刚刚插入信息的id - PKControlProjects controlItem = pkControlProjectsDao - .selectItemByUrlMD5( - pkControlProjectsTableName, - urlMD5); - // 用id构造model对应的固定不变的id - model.setId(controlItem.getId()); - prjId = model.getId(); - } - model.setUpdate_mark(0); - handleInsertGatherProjects(model, conf); - GatherProcess.urlMd5Set.add(urlMD5); - } - -// // 将tag和项目的关系存入表item_tag_relation 并分离tag -// String tags = model.getTags(); -// if (tags == null) { -// // 表示该项目没有标签 -// continue; -// } -// List tagList = DataHandler -// .tagsSegmentation(tags); -// for (String tag : tagList) { -// targetDao.insertTag(tagsTableName, tag);// ignore方式插入该项目的标签 -// int tag_id = targetDao.selectTagIdByName( -// tagsTableName, tag); -// Taggings taggings = new Taggings(); -// taggings.setTag_id(tag_id); -// taggings.setTaggable_id(prjId); -// taggings.setTaggable_type("OpenSourceProject"); -// taggings.setContext("tags"); -// taggings.setCreated_at(DataHandler.getNow()); -// // 将Taggings对象存入数据库中 -// try { -// targetDao.insertTaggings(taggingsTableName, -// taggings); -// } catch (Exception e) { -// // 在插入记录之前 relative_memos表中的记录已经被删除掉了 -// logger.error(e); -// System.exit(0); -// } -// } - } - } else { - List dataGet = gatherDao.getJobGatherData( - sourceTableName, selectItems, beginId, endId, - conf.getAndWhere()); - for (int i = 0; i < dataGet.size(); i++) { - JobRequirement model = dataGet.get(i); - String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子 - // 是否更新 - int postId = 0; - if(GatherProcess.urlMd5Set.contains(urlMD5)){ - JobRequirement sameJob = targetDao.findJobByUrlMD5( - conf.getTargetTableName(), urlMD5); - // update gather_projects表中对应的记录,在维持待更新表 - postId = sameJob.getId(); - model.setId(postId); - handleUpdateGatherJobs(sameJob.getId(), model); - }else{ - // 不存在 插入 - PKControlPosts pkControlPosts = pkControlPostsDao - .selectItemByUrlMD5( - pkControlPostsTableName, urlMD5);// 查看有没有固定的id - if (pkControlPosts != null) - model.setId(pkControlPosts.getId()); - else { - // 在pk_control_posts表中生成当前项目对应的id - pkControlPostsDao.insertOneItem( - pkControlPostsTableName, urlMD5); - // 查看刚刚插入信息的id - PKControlPosts controlItem = pkControlPostsDao - .selectItemByUrlMD5( - pkControlPostsTableName, urlMD5); - // 用id构造model对应的固定不变的id - model.setId(controlItem.getId()); - postId = model.getId(); - } - handleInsertGatherJobs(model, conf); - GatherProcess.urlMd5Set.add(urlMD5); - } - } - } - - } catch (Exception ex) { - // 数据迁移过程可能发生异常情况 - logger.error(ex); - System.exit(0); - } - - // 更新游标到本次 EndId+1; - pointerDao.updatePointer(conf.getPointerTableName(), - sourceTableName, conf.getTargetTableName(), endId + 1);// sourceIdBegin - // + - // idsIncrement - logger.info("current--" + sourceTableName + ": " + endId); - } - } - - // 处理URL不存在的帖子 插入relative_memos表 - public void handleInsertGatherPosts(RelativeMemo model, Configure conf) { - try { - targetDao.insertRelativeMemo(conf.getTargetTableName(), - conf.getTargetFields(), model); - } catch (Exception e) { - logger.error(e); - } - } - - // 处理URL相同的帖子更新 id表示更新的帖子固定id - public void handleUpdateGatherPosts(int id, RelativeMemo model_new) { - targetDao.updateRelativeMemo(gatherPostsTableName, model_new, id);// 更新数据relative_memos表 - } - - // 处理URL不存在的项目 插入gather_projects表 - public void handleInsertGatherProjects(GatherProject model, Configure conf) { - try { - targetDao.insertOpenSourceProject(conf.getTargetTableName(), - conf.getTargetFields(), model); - } catch (Exception e) { - logger.error(e); - } - - } - - // 处理URL相同的项目更新 id表示更新的项目固定id - public void handleUpdateGatherProjects(int id, GatherProject model_new) { - targetDao.updateOpenSourceProject(conf.getTargetTableName(), model_new, - id);// 更新数据gather_projects表 - } - - // 处理URL不存在的项目 插入job_requirements表 - public void handleInsertGatherJobs(JobRequirement model, Configure conf) { - try { - targetDao.insertJobRequirement(conf.getTargetTableName(), - conf.getTargetFields(), model); - } catch (Exception e) { - logger.error(e); - } - - } - - // 处理URL相同的项目更新 id表示更新的项目固定id - public void handleUpdateGatherJobs(int id, JobRequirement model_new) { - targetDao - .updateJobRequirement(conf.getTargetTableName(), model_new, id);// 更新数据job_requirements表 - } -} +package org.ossean.gather.process; + +import java.util.List; + +import javax.annotation.Resource; + +import org.apache.log4j.Logger; +import org.ossean.gather.model.Configure; +import org.ossean.gather.model.GatherProject; +import org.ossean.gather.model.JobRequirement; +import org.ossean.gather.model.PKControlPosts; +import org.ossean.gather.model.PKControlProjects; +import org.ossean.gather.model.RelativeMemo; +import org.ossean.gather.model.Taggings; +import org.ossean.gather.sourceDao.GatherDao; +import org.ossean.gather.sourceDao.PKControlPostsDao; +import org.ossean.gather.sourceDao.PKControlProjectsDao; +import org.ossean.gather.targetDao.PointerDao; +import org.ossean.gather.targetDao.TargetDao; +import org.springframework.context.annotation.Scope; +import org.springframework.stereotype.Component; + +@Component("gatherThread") +@Scope("prototype") +public class GatherThread implements Runnable { + private static Logger logger = Logger.getLogger(GatherThread.class); + private Configure conf; + + @Resource + private GatherDao gatherDao; + @Resource + private PointerDao pointerDao; + @Resource + private PKControlPostsDao pkControlPostsDao; + @Resource + private TargetDao targetDao; + @Resource + private PKControlProjectsDao pkControlProjectsDao; + + private int idsBegin; // 转移开始Id值 + private int idsEnd; // 转移结束Id值 + private int idsIncrement;// 每次转移的Id量 + + private int beginId; + private int endId; + + private String sourceTableName; + private String pkControlPostsTableName = "pk_control_posts"; + private String pkControlProjectsTableName = "pk_control_projects"; + private String memoTaggingsTableName = "memo_taggings"; + private String tagsTableName = "tags"; + + private String gatherPostsTableName = "relative_memos"; + private int maxId; + + public void setParameters(Configure conf, String sourceTableName) { + this.conf = conf; + this.sourceTableName = sourceTableName; + } + + // 读指针 + public int readPointer(String table, String source, String target) { + int pointer = 1; + try { + pointer = pointerDao.getPointer(table, source, target); + } catch (Exception e) { + // 表示表中没有数据 + logger.info("No such pointer! Create one"); + pointerDao.insertPointer(table, source, target, 1); + } + return pointer; + } + + @Override + public void run() { +// long start = System.currentTimeMillis(); + Thread.currentThread().setName(sourceTableName); + idsIncrement = conf.getIdsIncrement(); + idsBegin = readPointer(conf.getPointerTableName(), sourceTableName, + conf.getTargetTableName()); + idsEnd = maxId = gatherDao.getMaxId(sourceTableName); + while (idsBegin < idsEnd) { + beginId = idsBegin; + endId = beginId + idsIncrement - 1; // 取数据时两边都取等号 + if (endId <= maxId) { + handleBatchData(beginId, endId, conf); + idsBegin = idsBegin + idsIncrement; + } else { + endId = maxId; // endId应小于maxId + handleBatchData(beginId, endId, conf); + break; + } + } + GatherProcess.gatherState.put(sourceTableName, false); +// long end = System.currentTimeMillis(); +// logger.info((end - start) / 6000); + } + + public void handleBatchData(int beginId, int endId, Configure conf) { + logger.info("BeginId#" + sourceTableName + ":" + beginId); + // 表示任务没有完成 + int maxId = gatherDao.getMaxId(sourceTableName); + // 防止转移超过当前最大值的Id数据 + if (beginId >= 0 && endId > 0 && maxId >= endId) { + // 更新执行开始时间 + logger.info("begin gathering..."); + // 插入Id段数据,忽略重复值 + try { + String[] sourceFields = conf.getSourceFields().split(","); + String[] targetFields = conf.getTargetFields().split(","); + String selectItems = ""; + for (int i = 0; i < sourceFields.length; i++) { + String str_source = sourceFields[i]; + String str_target = targetFields[i]; + selectItems += str_source + " as " + str_target + ","; + } + selectItems = selectItems + .substring(0, selectItems.length() - 1) + " "; + if (conf.getTargetTableName().equals("relative_memos")) { + List dataGet = gatherDao.getPostGatherData( + sourceTableName, selectItems, beginId, endId, + conf.getAndWhere()); + for (int i = 0; i < dataGet.size(); i++) { + RelativeMemo model = dataGet.get(i); + String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子 + // 是否更新 + int postId = 0; + if(GatherProcess.urlMd5Set.contains(urlMD5)){ + //urlmd5存在则更新 + RelativeMemo samePost = targetDao.findPostByUrlMD5( + conf.getTargetTableName(), urlMD5); + // update gather_projects表中对应的记录,在维持待更新表 + postId = samePost.getId(); + model.setId(postId); + handleUpdateGatherPosts(samePost.getId(), model); + }else{ + // 不存在 插入 + PKControlPosts pkControlModel = pkControlPostsDao + .selectItemByUrlMD5( + pkControlPostsTableName, urlMD5);// 查看有没有固定的id + if (pkControlModel != null) + model.setId(pkControlModel.getId()); + else { + // 在pk_control_posts表中生成当前项目对应的id + pkControlPostsDao.insertOneItem( + pkControlPostsTableName, urlMD5); + // 查看刚刚插入信息的id + PKControlPosts controlItem = pkControlPostsDao + .selectItemByUrlMD5( + pkControlPostsTableName, urlMD5); + // 用id构造model对应的固定不变的id + model.setId(controlItem.getId()); + postId = model.getId(); + } + handleInsertGatherPosts(model, conf); + GatherProcess.urlMd5Set.add(urlMD5); + } + + // 将tag和项目的关系存入表item_tag_relation 并分离tag + String tags = model.getTags(); + if (tags == null) { + // 表示该项目没有标签 + continue; + } + List tagList = DataHandler + .tagsSegmentation(tags); + for (String tag : tagList) { + targetDao.insertTag(tagsTableName, tag);// ignore方式插入该项目的标签 + int tag_id = targetDao.selectTagIdByName( + tagsTableName, tag); + Taggings taggings = new Taggings(); + taggings.setTag_id(tag_id); + taggings.setTaggable_id(postId); + taggings.setTaggable_type("RelativeMemo"); + taggings.setContext("tags"); + taggings.setCreated_at(DataHandler.getNow()); + // 将Taggings对象存入数据库中 + try { + targetDao.insertTaggings(memoTaggingsTableName, + taggings); + } catch (Exception e) { + // 在插入记录之前 relative_memos表中的记录已经被删除掉了 + logger.error(e); + System.exit(0); + } + } + } + } else if (conf.getTargetTableName().equals("gather_projects")) { + List dataGet = gatherDao.getPrjGatherData( + sourceTableName, selectItems, beginId, endId, + conf.getAndWhere()); + for (int i = 0; i < dataGet.size(); i++) { + GatherProject model = dataGet.get(i); + String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该项目 + // 是否更新 + int prjId = 0; + if(GatherProcess.urlMd5Set.contains(urlMD5)){ + GatherProject samePrj = targetDao.findPrjByUrlMD5( + conf.getTargetTableName(), urlMD5); + // update gather_projects表中对应的记录,在维持待更新表 + prjId = samePrj.getId(); + model.setId(prjId); + model.setUpdate_mark(2); + handleUpdateGatherProjects(samePrj.getId(), model); + }else{ + // 不存在 插入 + PKControlProjects pkControlProjects = pkControlProjectsDao + .selectItemByUrlMD5( + pkControlProjectsTableName, urlMD5);// 查看有没有固定的id + if (pkControlProjects != null) + model.setId(pkControlProjects.getId()); + else { + // 在pk_control_posts表中生成当前项目对应的id + pkControlProjectsDao.insertOneItem( + pkControlProjectsTableName, urlMD5); + // 查看刚刚插入信息的id + PKControlProjects controlItem = pkControlProjectsDao + .selectItemByUrlMD5( + pkControlProjectsTableName, + urlMD5); + // 用id构造model对应的固定不变的id + model.setId(controlItem.getId()); + prjId = model.getId(); + } + model.setUpdate_mark(0); + handleInsertGatherProjects(model, conf); + GatherProcess.urlMd5Set.add(urlMD5); + } + +// // 将tag和项目的关系存入表item_tag_relation 并分离tag +// String tags = model.getTags(); +// if (tags == null) { +// // 表示该项目没有标签 +// continue; +// } +// List tagList = DataHandler +// .tagsSegmentation(tags); +// for (String tag : tagList) { +// targetDao.insertTag(tagsTableName, tag);// ignore方式插入该项目的标签 +// int tag_id = targetDao.selectTagIdByName( +// tagsTableName, tag); +// Taggings taggings = new Taggings(); +// taggings.setTag_id(tag_id); +// taggings.setTaggable_id(prjId); +// taggings.setTaggable_type("OpenSourceProject"); +// taggings.setContext("tags"); +// taggings.setCreated_at(DataHandler.getNow()); +// // 将Taggings对象存入数据库中 +// try { +// targetDao.insertTaggings(taggingsTableName, +// taggings); +// } catch (Exception e) { +// // 在插入记录之前 relative_memos表中的记录已经被删除掉了 +// logger.error(e); +// System.exit(0); +// } +// } + } + } else { + List dataGet = gatherDao.getJobGatherData( + sourceTableName, selectItems, beginId, endId, + conf.getAndWhere()); + for (int i = 0; i < dataGet.size(); i++) { + JobRequirement model = dataGet.get(i); + String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子 + // 是否更新 + int postId = 0; + if(GatherProcess.urlMd5Set.contains(urlMD5)){ + JobRequirement sameJob = targetDao.findJobByUrlMD5( + conf.getTargetTableName(), urlMD5); + // update gather_projects表中对应的记录,在维持待更新表 + postId = sameJob.getId(); + model.setId(postId); + handleUpdateGatherJobs(sameJob.getId(), model); + }else{ + // 不存在 插入 + PKControlPosts pkControlPosts = pkControlPostsDao + .selectItemByUrlMD5( + pkControlPostsTableName, urlMD5);// 查看有没有固定的id + if (pkControlPosts != null) + model.setId(pkControlPosts.getId()); + else { + // 在pk_control_posts表中生成当前项目对应的id + pkControlPostsDao.insertOneItem( + pkControlPostsTableName, urlMD5); + // 查看刚刚插入信息的id + PKControlPosts controlItem = pkControlPostsDao + .selectItemByUrlMD5( + pkControlPostsTableName, urlMD5); + // 用id构造model对应的固定不变的id + model.setId(controlItem.getId()); + postId = model.getId(); + } + handleInsertGatherJobs(model, conf); + GatherProcess.urlMd5Set.add(urlMD5); + } + } + } + + } catch (Exception ex) { + // 数据迁移过程可能发生异常情况 + logger.error(ex); + System.exit(0); + } + + // 更新游标到本次 EndId+1; + pointerDao.updatePointer(conf.getPointerTableName(), + sourceTableName, conf.getTargetTableName(), endId + 1);// sourceIdBegin + // + + // idsIncrement + logger.info("current--" + sourceTableName + ": " + endId); + } + } + + // 处理URL不存在的帖子 插入relative_memos表 + public void handleInsertGatherPosts(RelativeMemo model, Configure conf) { + try { + targetDao.insertRelativeMemo(conf.getTargetTableName(), + conf.getTargetFields(), model); + } catch (Exception e) { + logger.error(e); + } + } + + // 处理URL相同的帖子更新 id表示更新的帖子固定id + public void handleUpdateGatherPosts(int id, RelativeMemo model_new) { + targetDao.updateRelativeMemo(gatherPostsTableName, model_new, id);// 更新数据relative_memos表 + } + + // 处理URL不存在的项目 插入gather_projects表 + public void handleInsertGatherProjects(GatherProject model, Configure conf) { + try { + targetDao.insertOpenSourceProject(conf.getTargetTableName(), + conf.getTargetFields(), model); + } catch (Exception e) { + logger.error(e); + } + + } + + // 处理URL相同的项目更新 id表示更新的项目固定id + public void handleUpdateGatherProjects(int id, GatherProject model_new) { + targetDao.updateOpenSourceProject(conf.getTargetTableName(), model_new, + id);// 更新数据gather_projects表 + } + + // 处理URL不存在的项目 插入job_requirements表 + public void handleInsertGatherJobs(JobRequirement model, Configure conf) { + try { + targetDao.insertJobRequirement(conf.getTargetTableName(), + conf.getTargetFields(), model); + } catch (Exception e) { + logger.error(e); + } + + } + + // 处理URL相同的项目更新 id表示更新的项目固定id + public void handleUpdateGatherJobs(int id, JobRequirement model_new) { + targetDao + .updateJobRequirement(conf.getTargetTableName(), model_new, id);// 更新数据job_requirements表 + } +} diff --git a/match_program/bin/resources/applicationContext_mybatis.xml b/match_program/bin/resources/applicationContext_mybatis.xml index 04ac8c6..65e88cf 100644 --- a/match_program/bin/resources/applicationContext_mybatis.xml +++ b/match_program/bin/resources/applicationContext_mybatis.xml @@ -36,9 +36,9 @@ destroy-method="close"> + value="jdbc:mysql://172.16.128.36:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" /> - + diff --git a/match_program/src/main/java/com/ossean/match/lucene/LuceneIndex.java b/match_program/src/main/java/com/ossean/match/lucene/LuceneIndex.java index e8aa53c..d4c27f9 100644 --- a/match_program/src/main/java/com/ossean/match/lucene/LuceneIndex.java +++ b/match_program/src/main/java/com/ossean/match/lucene/LuceneIndex.java @@ -17,7 +17,6 @@ import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; @@ -36,11 +35,9 @@ import org.wltea.analyzer.lucene.IKAnalyzer; import com.ossean.match.dao.ProjectDao; import com.ossean.match.dao.RelativeMemoDao; -import com.ossean.match.matchprocess.CountFrequency; import com.ossean.match.matchprocess.MatchIncrement; import com.ossean.match.model.Project; import com.ossean.match.model.RelativeMemo; -import com.ossean.match.model.Tag; import com.ossean.match.utils.Normalizer; @Component("luceneindex") @@ -84,36 +81,6 @@ public class LuceneIndex { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); IndexWriter iw = new IndexWriter(dire, iwc); return iw; - } - - public static void buildTagIndex(List tags) { - Logger logger1 = LoggerFactory.getLogger(LuceneIndex.class); - Directory dir; - Analyzer analyzer = new IKAnalyzer(true); - try { - dir = FSDirectory.open(Paths.get(CountFrequency.tagIndexDir)); - IndexWriterConfig iwc = new IndexWriterConfig(analyzer); - iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); - IndexWriter writer = new IndexWriter(dir, iwc); - for (Tag tag : tags) { - // 对标签建立索引 - Document doc = new Document(); - String name = tag.getName().toLowerCase(); - // 标签原始名字进行存储 - doc.add(new StringField("name", "" + name, Field.Store.YES)); - doc.add(new StringField("id", "" + tag.getId(), Field.Store.YES)); - // 对标签进行分词 - doc.add(new TextField("items", Normalizer.normalize(name), - Field.Store.NO)); - writer.addDocument(doc); - } - writer.close(); - - } catch (IOException e) { - logger1.error("buildTagIndex IOException: " + e); - } - // Analyzer analyzer = new SimpleAnalyzer(); - } /** @@ -172,9 +139,6 @@ public class LuceneIndex { endId += step; } } - matchIncrement.setPrjHistory(0); - matchIncrement.setPrjIndexed(maxPrjId); - matchIncrement.writeIntoRecord(); writer.commit(); writer.close(); } @@ -225,9 +189,7 @@ public class LuceneIndex { endId += step; } } - matchIncrement.setPrjHistory(0); matchIncrement.setMemoHistory(lastMemoId); - matchIncrement.setMemoIndexed(lastMemoId); matchIncrement.writeIntoRecord(); /*FileOutputStream fout = new FileOutputStream(new File("record.txt")); fout.write((0 + "\t" + lastMemoId + "\t" + 0 + "\t" + 0).getBytes()); //初始化record.txt,项目初始id为0,帖子初始id为创建索引的最后一个帖子的id @@ -256,7 +218,7 @@ public class LuceneIndex { continue; } Document doc = new Document(); - doc.add(new StringField(memoIdFieldName, String.valueOf(rsId), Store.YES)); + doc.add(new StringField(memoIdFieldName, String.valueOf(rsId) + "", Store.YES)); doc.add(new TextField(titleFieldName, memo.getTitle(), Store.NO)); String memoTagsString = memo.getTags(); if (memoTagsString == null) { @@ -266,7 +228,7 @@ public class LuceneIndex { for(String memoTag : memoTagsList){ doc.add(new StringField(memoTagsFieldName, memoTag, Store.NO)); } - indexWriter.addDocument(doc); + indexWriter.updateDocument(new Term(memoIdFieldName, String.valueOf(rsId) + ""), doc); } indexWriter.commit(); indexWriter.close(); diff --git a/match_program/src/main/java/com/ossean/match/lucene/LuceneSearch.java b/match_program/src/main/java/com/ossean/match/lucene/LuceneSearch.java index df043b6..66794e1 100644 --- a/match_program/src/main/java/com/ossean/match/lucene/LuceneSearch.java +++ b/match_program/src/main/java/com/ossean/match/lucene/LuceneSearch.java @@ -7,11 +7,8 @@ import java.util.List; import javax.annotation.Resource; import org.apache.lucene.document.Document; -import org.apache.lucene.index.FieldInvertState; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Explanation; @@ -24,10 +21,8 @@ import org.apache.lucene.search.similarities.DefaultSimilarity; import org.apache.lucene.search.similarities.Similarity; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.wltea.analyzer.lucene.IKAnalyzer; import com.ossean.match.dao.ProjectDao; -import com.ossean.match.model.Project; import com.ossean.match.utils.Normalizer; public class LuceneSearch { @@ -89,7 +84,7 @@ public class LuceneSearch { } query.add(tq, BooleanClause.Occur.SHOULD); } - TopDocs td = is.search(query, 3); + TopDocs td = is.search(query, 10000); ScoreDoc[] sds = td.scoreDocs; for (ScoreDoc sd : sds) { Document d = is.doc(sd.doc); @@ -98,10 +93,6 @@ public class LuceneSearch { for(String prjName : prjNames){ if (keyWords.contains(prjName)) { int pId = Integer.parseInt(prjId); - Project currentPrj = projectDao.getPrjById(pId); - if(currentPrj.getFiltration()==0){ - - } if (matchMap.containsKey(pId)) { matchMap.put(pId, matchMap.get(pId) + weight + sd.score/1000); } else @@ -130,59 +121,45 @@ public class LuceneSearch { */ public static HashMap prjToMemoMatchByLucene( String prjName, String searchField, double weight, - HashMap map, IndexReader memoIndexReader, IndexReader prjIndexReader) { + HashMap map, IndexReader memoIndexReader) { try { IndexSearcher is = new IndexSearcher(memoIndexReader); List prjNameList = Normalizer.getList(prjName); BooleanQuery query = new BooleanQuery(); -// Similarity similarity = new DefaultSimilarity(){ -// @Override -// public float queryNorm(float sumOfSquaredWeights) { -// return 1.0f; -// } -// @Override -// public float lengthNorm(FieldInvertState state) { -// return 1.0f; -// } -// }; -// is.setSimilarity(similarity); for(String prjNameTerm : prjNameList){ - //Term termForFreq = new Term(LuceneIndex.prjNameFieldName, prjNameTerm); Term term = new Term(searchField, prjNameTerm); TermQuery tq = new TermQuery(term); -// double curTermDocFreq = prjIndexReader.docFreq(termForFreq); -// if (prjNameList.size() > 1 && curTermDocFreq > 10) { -// tq.setBoost((float) (1/(curTermDocFreq/10))); -// } -// else { -// tq.setBoost(1.1f); -// } - query.add(tq, BooleanClause.Occur.MUST); + query.add(tq, BooleanClause.Occur.MUST); //项目名分词后的每个term都必须在帖子中出现 } TopDocs td = is.search(query, 1000000); ScoreDoc[] sds = td.scoreDocs; for (ScoreDoc sd : sds) { Document d = is.doc(sd.doc); -// if (sd.score >= 0.7) { String postId = d.get(LuceneIndex.memoIdFieldName); int pId = Integer.parseInt(postId); if (map.containsKey(pId)) { map.put(pId, map.get(pId) + weight + sd.score/1000); } else map.put(pId, weight + sd.score/1000); -// } } } catch (IOException e) { logger.error("prjToMemoMatchByLucene IOException: " + e); - } /*catch (ParseException e) { - logger.error("prjToMemoMatchByLucene ParseException: " + e); - }*/ + } return map; } - public static HashMap searchMemoTags(String tagStr, String searchField, double weight, + /** + * 项目名和项目别名与帖子标签之间的匹配 + * @param tagStr + * @param searchField + * @param weight + * @param map + * @param indexReader + * @return + */ + public static HashMap searchMemoTags(String tagStr, String searchField, String idField, double weight, HashMap map, IndexReader indexReader) { try { @@ -193,11 +170,11 @@ public class LuceneSearch { ScoreDoc[] sds = td.scoreDocs; for (ScoreDoc sd : sds) { Document d = is.doc(sd.doc); - String postId = d.get(LuceneIndex.memoIdFieldName); + String postId = d.get(idField); int pId = Integer.parseInt(postId); - if (map.containsKey(pId)) { + if (map.containsKey(pId)) { map.put(pId, map.get(pId) + weight); - } else { + } else { map.put(pId, weight); } } @@ -207,6 +184,16 @@ public class LuceneSearch { return map; } + /** + * 项目标签与帖子标签 + * @param tagStr + * @param searchField + * @param idField + * @param weight + * @param map + * @param indexReader + * @return + */ public static HashMap searchByPrjTag(String tagStr, String searchField, String idField, double weight, HashMap map, IndexReader indexReader) { @@ -220,7 +207,7 @@ public class LuceneSearch { Document d = is.doc(sd.doc); String postId = d.get(idField); int pId = Integer.parseInt(postId); - if (map.containsKey(pId)) { + if (map.containsKey(pId)) { //有项目名和项目别名的匹配时才加入标签匹配的结果 map.put(pId, map.get(pId) + weight); } } @@ -230,6 +217,16 @@ public class LuceneSearch { return map; } + /** + * 项目标签搜索帖子标题 + * @param tagStr + * @param searchField + * @param idField + * @param weight + * @param map + * @param indexReader + * @return + */ public static HashMap searchByPrjTagInMemoTitle(String tagStr, String searchField, String idField, double weight, HashMap map, IndexReader indexReader) { @@ -260,34 +257,34 @@ public class LuceneSearch { // 获得每个帖子匹配到的标签个数 - public static HashMap tagsMatch(String idField, - String tags, String searchField, IndexReader indexReader) { - HashMap tagsMatchNum = new HashMap(); - try { - IndexSearcher is = new IndexSearcher(indexReader); - QueryParser parser = new QueryParser(searchField, new IKAnalyzer(true)); - Query query = parser.parse(tags); - TopDocs td = is.search(query, 100000); - ScoreDoc[] sds = td.scoreDocs; - for (ScoreDoc sd : sds) { - Document d = is.doc(sd.doc); - String postId = d.get(idField); - int pId = Integer.parseInt(postId); - Explanation explanation = is.explain(query, sd.doc); - int hitNum = getHitTermsNum(explanation); - if (tagsMatchNum.containsKey(pId)) { - tagsMatchNum.put(pId, tagsMatchNum.get(pId) + hitNum); - } else - tagsMatchNum.put(pId, hitNum); - } - } catch (IOException e) { - logger.error("tagsMatch IOException: " + e); - } catch (ParseException e) { - logger.error("tagsMatch ParseException: " + e); - } - - return tagsMatchNum; - } +// public static HashMap tagsMatch(String idField, +// String tags, String searchField, IndexReader indexReader) { +// HashMap tagsMatchNum = new HashMap(); +// try { +// IndexSearcher is = new IndexSearcher(indexReader); +// QueryParser parser = new QueryParser(searchField, new IKAnalyzer(true)); +// Query query = parser.parse(tags); +// TopDocs td = is.search(query, 100000); +// ScoreDoc[] sds = td.scoreDocs; +// for (ScoreDoc sd : sds) { +// Document d = is.doc(sd.doc); +// String postId = d.get(idField); +// int pId = Integer.parseInt(postId); +// Explanation explanation = is.explain(query, sd.doc); +// int hitNum = getHitTermsNum(explanation); +// if (tagsMatchNum.containsKey(pId)) { +// tagsMatchNum.put(pId, tagsMatchNum.get(pId) + hitNum); +// } else +// tagsMatchNum.put(pId, hitNum); +// } +// } catch (IOException e) { +// logger.error("tagsMatch IOException: " + e); +// } catch (ParseException e) { +// logger.error("tagsMatch ParseException: " + e); +// } +// +// return tagsMatchNum; +// } //get the number of terms hitted in docs public static int getHitTermsNum(Explanation explanation){ diff --git a/match_program/src/main/java/com/ossean/match/main/Main.java b/match_program/src/main/java/com/ossean/match/main/Main.java index 18557c2..f2d53df 100644 --- a/match_program/src/main/java/com/ossean/match/main/Main.java +++ b/match_program/src/main/java/com/ossean/match/main/Main.java @@ -9,7 +9,6 @@ import org.springframework.context.support.ClassPathXmlApplicationContext; import org.springframework.stereotype.Component; import com.ossean.match.lucene.LuceneIndex; -import com.ossean.match.matchprocess.CountFrequency; import com.ossean.match.matchprocess.MatchIncrement; import com.ossean.match.matchprocess.MemoToPrjMatch; import com.ossean.match.matchprocess.NewPrjMonitor; @@ -29,10 +28,6 @@ public class Main { @Autowired private PrjToMemoMatch prjToMemoMatch; - @Qualifier("countfrequency") - @Autowired - private CountFrequency countFrequency; - @Qualifier("luceneindex") @Autowired private LuceneIndex luceneIndex; @@ -49,31 +44,17 @@ public class Main { public void start() throws InterruptedException, ParseException { while (true) { MatchIncrement matchIncrement = new MatchIncrement(); - if (matchIncrement.getTaskToPrjId() != 0) { //判断是否设置了截止项目id - prjToMemoMatch.setMatchIncrement(matchIncrement); - countFrequency.setMatchIncrement(matchIncrement); - if (matchIncrement.getPrjHistory() == 0 && matchIncrement.getMemoHistory() == 0) { - luceneIndex.run(matchIncrement); - } - newPrjMonitor.run(); - countFrequency.run(); - prjToMemoMatch.run(); - } - else { - newPrjMonitor.setMatchIncrement(matchIncrement); - memoToPrjMatch.setMatchIncrement(matchIncrement); - prjToMemoMatch.setMatchIncrement(matchIncrement); - countFrequency.setMatchIncrement(matchIncrement); - - if (matchIncrement.getPrjHistory() == 0 && matchIncrement.getMemoHistory() == 0) { - luceneIndex.run(matchIncrement); - } - - newPrjMonitor.run(); - countFrequency.run(); - prjToMemoMatch.run(); - memoToPrjMatch.run(); + newPrjMonitor.setMatchIncrement(matchIncrement); + memoToPrjMatch.setMatchIncrement(matchIncrement); + prjToMemoMatch.setMatchIncrement(matchIncrement); + + if (matchIncrement.getMemoHistory() == 0) { + luceneIndex.run(matchIncrement); } + + newPrjMonitor.run(); + prjToMemoMatch.run(); + memoToPrjMatch.run(); if (matchIncrement.getSleepTime() > 0) { logger.info(".........sleeping.........." + matchIncrement.getSleepTime()/1000 + "s......"); } diff --git a/match_program/src/main/java/com/ossean/match/matchprocess/CountFrequency.java b/match_program/src/main/java/com/ossean/match/matchprocess/CountFrequency.java deleted file mode 100644 index 683347f..0000000 --- a/match_program/src/main/java/com/ossean/match/matchprocess/CountFrequency.java +++ /dev/null @@ -1,364 +0,0 @@ -package com.ossean.match.matchprocess; - -import java.io.IOException; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import javax.annotation.Resource; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.stereotype.Component; -import org.wltea.analyzer.lucene.IKAnalyzer; - -import com.ossean.match.dao.AtomicItemDao; -import com.ossean.match.dao.ProjectDao; -import com.ossean.match.dao.TagDao; -import com.ossean.match.lucene.LuceneIndex; -import com.ossean.match.model.Atom; -import com.ossean.match.model.Project; -import com.ossean.match.model.Tag; -import com.ossean.match.pipeline.AtomicItemPipeline; -import com.ossean.match.pipeline.ResultPipeline; -import com.ossean.match.utils.Extractor; -import com.ossean.match.utils.Normalizer; -import com.ossean.match.utils.SimilarityCounter; - -@Component("countfrequency") -public class CountFrequency { - - @Resource - private AtomicItemPipeline atomicItemPipeline; - @Resource - private AtomicItemDao atomicItemDao; - @Resource - private ProjectDao projectDao; - @Resource - private TagDao tagDao; - @Resource - private ResultPipeline resultPipeline; - - public static List atomList; - // = atomicItemDao.selectAllAtom(); - - public static Map> atoms; - // = transformAtoms(atomList); - - public static String tagIndexDir = "tagIndexDir"; - - private Logger logger = LoggerFactory.getLogger(getClass()); - MatchIncrement matchIncrement = null; - - public void setMatchIncrement(MatchIncrement matchIncrement) { - this.matchIncrement = matchIncrement; - } - - private static Map> transformAtoms( - List atomList) { - // TODO Auto-generated method stub - ArrayList nums = null; - Map> atoms = new HashMap>(); - String name = null; - for (Atom atom : atomList) { - nums = new ArrayList(2); - nums.add(atom.getNumTag()); - nums.add(atom.getNumInProj()); - nums.add(atom.getStatus()); - name = atom.getName(); - atoms.put(name, nums); - } - return atoms; - } - - private List retransformAtoms(Map> atoms) { - List atomList = new ArrayList(); - Set items = atoms.keySet(); - for (String item : items) { - Atom atom = new Atom(); - atom.setName(item); - atom.setNumInTag(atoms.get(item).get(0)); - atom.setNumInProj(atoms.get(item).get(1)); - atom.setStatus(atoms.get(item).get(2)); - atomList.add(atom); - } - return atomList; - } - - private void extractTags(List tags, - Map> atoms) { - for (Tag tag : tags) { - List items = Extractor.extractAtoms(tag.getName()); - logger.info("extract tag : " + tag.getId() + "," - + tag.getName() + ">>" + items); - saveAtoms(items, atoms, 0); - - } - } - - private void extractProjects(List projects, - Map> atoms) { - for (Project project : projects) { - List items = Extractor.extractAtoms(project.getName()); - logger.info("extract project : " + project.getId() + "," - + project.getName() + ">>" + items); - saveAtoms(items, atoms, 1); - } - } - - /** - * @param items 抽取元素 - * @param atoms 存储抽取到的元素 - * @param pos 用来标注项目和标签,1为项目,0为标签 - */ - private void saveAtoms(List items, - Map> atoms, int pos) { - - ArrayList nums = null; - for (String item : items) { - if (atoms.containsKey(item)) { - nums = atoms.get(item); - if(nums.get(2) == 0) { - nums.set(2, 1); - } - } else { - nums = new ArrayList(2); - nums.add(0); - nums.add(0); - nums.add(2); - } - nums.set(pos, nums.get(pos) + 1); - atoms.put(item, nums); - } - } - - // 对项目(projName)和标签进行匹配,并将匹配结果存入数据库 - private List queryMatch(String projName, int projId, Map> atoms) - throws ParseException { - Directory dir; - Analyzer analyzer = new IKAnalyzer(true); - List resultStrings = new ArrayList(); - try { - dir = FSDirectory.open(Paths.get(CountFrequency.tagIndexDir)); - IndexReader reader = DirectoryReader.open(dir); - IndexSearcher searcher = new IndexSearcher(reader); - QueryParser parser = new QueryParser("items", analyzer); - Query query = null; - if (projName != null && !"".equals(Normalizer.normalize(projName))) { - query = parser.parse(QueryParser.escape(Normalizer - .normalize(projName))); - - TopDocs hits = searcher.search(query, 1000); - if (hits.scoreDocs != null) { - for (ScoreDoc sc : hits.scoreDocs) { - int docNum = sc.doc; - int tagId = Integer.parseInt(searcher.doc(docNum).get( - "id")); - String tagName = searcher.doc(docNum).get("name"); - // System.out.println("match: " + projId + " - " + tagId - // + - // "(" - // + tagName + ")"); - // System.out.println("completely matched tag >>> id: " - // + tagId + ",name: " + tagOriginalName); - List itemsOfPrj = Extractor - .extractAtoms(projName); - List itemsOfTag = Extractor - .extractAtoms(tagName); - - int prjWeight[] = new int[itemsOfPrj.size()]; - int tagWeight[] = new int[itemsOfTag.size()]; - - int i = 0; - for (String item : itemsOfPrj) { - prjWeight[i++] = atoms.get(item).get(1); - } - i = 0; - for (String item : itemsOfTag) { - tagWeight[i++] = atoms.get(item).get(0); - } - - float score = SimilarityCounter.countSimilarity( - itemsOfPrj, prjWeight, itemsOfTag, tagWeight); - float EPSINON = 0.999F; - if (!((score >= -EPSINON) && (score <= EPSINON))) { - resultPipeline.insertResult3(projId, projName, - tagId, tagName, score); - resultStrings.add(tagName); - } - } - } - } - - } catch (IOException e) { - e.printStackTrace(); - } - return resultStrings; - } - - // 对项目(projName)和新标签进行匹配,并将匹配结果存入数据库 - private List queryMatchNewTag(String tagName, int tagId, Map> atoms) - throws ParseException { - // Directory dir; - tagName = tagName.trim(); - Analyzer analyzer = new IKAnalyzer(true); - List resultStrings = new ArrayList(); - try { - Directory dire = FSDirectory.open(Paths - .get(LuceneIndex.PRJS_INDEX_PATH)); - IndexReader indexReader = DirectoryReader.open(dire); - - // dir = FSDirectory.open(Paths.get(CountFrequency.tagIndexDir)); - // IndexReader reader = DirectoryReader.open(dir); - IndexSearcher searcher = new IndexSearcher(indexReader); - QueryParser parser = new QueryParser("items", analyzer); - Query query = null; - if (tagName != null && !"".equals(Normalizer.normalize(tagName))) { - query = parser.parse(QueryParser.escape(Normalizer - .normalize(tagName))); - TopDocs hits = searcher.search(query, 1000); - - if (hits.scoreDocs != null) { - for (ScoreDoc sc : hits.scoreDocs) { - int docNum = sc.doc; - int projId = Integer.parseInt(searcher.doc(docNum).get( - LuceneIndex.prjIdFieldName)); - String projName = searcher.doc(docNum).get( - LuceneIndex.prjNameFieldName); - // System.out.println("match: " + projId + " - " + tagId - // + - // "(" - // + tagName + ")"); - // System.out.println("completely matched tag >>> id: " - // + tagId + ",name: " + tagOriginalName); - List itemsOfPrj = Extractor - .extractAtoms(projName); - List itemsOfTag = Extractor - .extractAtoms(tagName); - - int prjWeight[] = new int[itemsOfPrj.size()]; - int tagWeight[] = new int[itemsOfTag.size()]; - - int i = 0; - for (String item : itemsOfPrj) { - prjWeight[i++] = atoms.get(item).get(1); - } - i = 0; - for (String item : itemsOfTag) { - tagWeight[i++] = atoms.get(item).get(0); - } - - float score = SimilarityCounter.countSimilarity( - itemsOfPrj, prjWeight, itemsOfTag, tagWeight); - float EPSINON = 0.999F; - if (!((score >= -EPSINON) && (score <= EPSINON))) { - // TODO pipeline - resultPipeline.insertResult3(projId, projName, - tagId, tagName, score); - resultStrings.add(projName); - } - } - } - } - - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return resultStrings; - } - - public void run() throws ParseException { - atomList = atomicItemDao.selectAllAtom(); - atoms = transformAtoms(atomList); - if (matchIncrement.getPrePrjHistory() == 0 - && matchIncrement.getTagHistory() == 0) { - List tags = tagDao.getBatchTag(matchIncrement.getTagHistory(), - matchIncrement.getLastTagId()); - // 建立索引 - LuceneIndex.buildTagIndex(tags); - - List projects = projectDao.getBatchPrjsIncre(2000000); - - // 抽取项目 - extractProjects(projects, atoms); - matchIncrement.getMatchRecord(); - matchIncrement.setPrePrjHistory(matchIncrement.getLastPrjId()); - - // 抽取标签 - extractTags(tags, atoms); - matchIncrement.setTagHistory(matchIncrement.getLastTagId()); - -// atomList.clear(); -// atomList = retransformAtoms(atoms); -// atomicItemPipeline.pipelineAtoms(atomList); - - // 新项目与标签匹配 - for (Project project : projects) { - queryMatch(project.getName().trim(), - project.getId(), atoms); - } - matchIncrement.writeIntoRecord(); - } else { - if (matchIncrement.isNewPrePrjFlag()) { - logger.info("start count frequency of projects match!!!"); - List projects = projectDao.getBatchPrjs( - matchIncrement.getPrjHistory(), - matchIncrement.getLastPrjId()); - - // 抽取项目 - extractProjects(projects, atoms); - matchIncrement.setPrePrjHistory(matchIncrement - .getLastPrjId()); - - // 新项目与标签匹配 - for (Project project : projects) { - List tags = new ArrayList(); - tags = queryMatch(project.getName().trim(), - project.getId(),atoms); - - } - matchIncrement.writeIntoRecord(); - } - if (matchIncrement.isNewTagflag()) { - logger.info("start count frequency of projects match!!!"); - - List tags = tagDao.getBatchTag( - matchIncrement.getTagHistory(), - matchIncrement.getLastTagId()); - // 建立索引 - LuceneIndex.buildTagIndex(tags); - // 抽取项目 - extractTags(tags, atoms); - matchIncrement.setTagHistory(matchIncrement.getLastTagId()); - - // 新标签与项目匹配 - for (Tag tag : tags) { - List projects = new ArrayList(); - projects = queryMatchNewTag(tag.getName().trim(), - tag.getId(), atoms); - - } - matchIncrement.writeIntoRecord(); - } - } - atomList.clear(); - atomList = retransformAtoms(atoms); - atomicItemPipeline.pipelineAtoms(atomList); - } - -} diff --git a/match_program/src/main/java/com/ossean/match/matchprocess/Match.java b/match_program/src/main/java/com/ossean/match/matchprocess/Match.java index 81daf4e..e229c98 100644 --- a/match_program/src/main/java/com/ossean/match/matchprocess/Match.java +++ b/match_program/src/main/java/com/ossean/match/matchprocess/Match.java @@ -5,7 +5,6 @@ import java.nio.file.Paths; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.Set; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; @@ -24,7 +23,6 @@ import com.ossean.match.lucene.LuceneSearch; import com.ossean.match.model.Project; import com.ossean.match.model.RelativeMemo; import com.ossean.match.model.RelativeMemoToOpenSourceProject; -import com.ossean.match.pipeline.ResultPipeline; import com.ossean.match.utils.Normalizer; import javax.annotation.Resource; @@ -38,8 +36,6 @@ public class Match { private double titleWeight = 0.8; private double threshholdWeight = 1.3; - @Resource - private ResultPipeline resultPipeline; @Resource private RelativeMemoToOpenSourceProjectDao matchResultDao; @Resource @@ -59,14 +55,11 @@ public class Match { try { Directory memoDire = FSDirectory.open(Paths .get(LuceneIndex.MEMOS_INDEX_PATH)); - Directory prjDire = FSDirectory.open(Paths - .get(LuceneIndex.PRJS_INDEX_PATH)); if (!DirectoryReader.indexExists(memoDire)) { // if the index directory // doesn't exist return; } IndexReader memoIndexReader = DirectoryReader.open(memoDire); - IndexReader prjIndexReader = DirectoryReader.open(prjDire); for(Project project : projects) { int prjId = 0; String prjName = ""; @@ -78,11 +71,14 @@ public class Match { if(project.getTags() != null) prjTags = project.getTags().toLowerCase(); prjName = project.getName().toLowerCase(); - //long start1 = System.currentTimeMillis(); - // 项目名匹配帖子标题 if (prjName.length() >= 2 && !Normalizer.isAllNumber(prjName)) { + // 项目名匹配帖子标题 matchMap = LuceneSearch.prjToMemoMatchByLucene(prjName, - LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader, prjIndexReader); + LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader); + // 项目名匹配帖子标签 + matchMap = LuceneSearch.searchMemoTags( + prjName, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight, + matchMap, memoIndexReader); } if (project.getSynonyms() != null) { synonyms = project.getSynonyms().toLowerCase(); @@ -91,88 +87,39 @@ public class Match { //项目别名匹配帖子标题 TODO for (String synonym : synonymsList) { matchMap = LuceneSearch.prjToMemoMatchByLucene(synonym, - LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader, prjIndexReader); + LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader); } //项目别名匹配帖子标签 TODO for (String synonym : synonymsList) { matchMap = LuceneSearch.searchMemoTags( - synonym, LuceneIndex.memoTagsFieldName, memoTagsWeight, + synonym, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight, matchMap, memoIndexReader); } } } - //long end1 = System.currentTimeMillis(); - //logger.info("项目名匹配帖子标题: " + (end1 - start1) + "ms"); - // 项目名匹配帖子标签 TODO :词频统计可直接从索引获得 - if (prjName.length() >= 2 && !Normalizer.isAllNumber(prjName)) { - Set tagStrings = resultPipeline.getTagbyPrjId(prjId); // 这里要返回标签匹配到的项目id; - for (String tag : tagStrings) { - matchMap = LuceneSearch.searchMemoTags( - tag.toLowerCase(), LuceneIndex.memoTagsFieldName, memoTagsWeight, - matchMap, memoIndexReader); - } - } - //long end2 = System.currentTimeMillis(); - //logger.info("项目名匹配帖子标签: " + (end2 - end1) + "ms"); - // 用项目标签搜索帖子标题 - //HashMap tagsCount = null; if (prjTags != null && prjTags.length() > 0 && matchMap.size() > 0) { List prjTagsList = Normalizer.tagsSegmentation(prjTags); + // 用项目标签搜索帖子标题 for (String prjTag : prjTagsList) { matchMap = LuceneSearch.searchByPrjTagInMemoTitle( prjTag, LuceneIndex.titleFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTitleWeight, matchMap, memoIndexReader); } - /*tagsCount = LuceneSearch.tagsMatch( - LuceneIndex.memoIdFieldName, prjTags, - LuceneIndex.titleFieldName, indexReader); - // tagsCount是,指每个post的标题匹配到的tag数量 - if (tagsCount.size() > 0) { - for (Map.Entry t : tagsCount.entrySet()) { - int postId = t.getKey(); - int x = t.getValue(); - if (matchMap.containsKey(postId)) { - matchMap.put(postId, matchMap.get(postId) + matchMap.get(postId) * 0.5 * (Math.log(x * x + 1) / Math.log(2))); - } - } - }*/ - //long end3 = System.currentTimeMillis(); - //logger.info("用项目标签搜索帖子标题: " + (end3 - end2) + "ms"); // 用项目标签搜索帖子标签 for (String prjTag : prjTagsList) { matchMap = LuceneSearch.searchByPrjTag( prjTag, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTagsWeight, matchMap, memoIndexReader); } - /*tagsCount = LuceneSearch.tagsMatch( - LuceneIndex.memoIdFieldName, prjTags, - LuceneIndex.memoTagsFieldName, indexReader); - if (tagsCount.size() > 0) { - for (Map.Entry t : tagsCount - .entrySet()) { - int postId = t.getKey(); - int y = t.getValue(); - if (matchMap.containsKey(postId)) { - matchMap.put(postId, matchMap.get(postId) + matchMap.get(postId) * 0.6 * (Math.log(y * y + 1) / Math.log(2))); - } - } - }*/ - //long end4 = System.currentTimeMillis(); - //logger.info("用项目标签搜索帖子标签: " + (end4 - end3) + "ms"); } - //long end5 = System.currentTimeMillis(); if (matchMap.size() > 0) insertPrjToMemoMatchResult(prjId, matchMap); matchMap.clear(); - //long end6 = System.currentTimeMillis(); - //logger.info("当前项目匹配结果入库: " + (end6 - end5) + "ms"); projectDao.updateProcessedPrj(prjId); logger.info("current prjId: " + prjId); } memoIndexReader.close(); memoDire.close(); - prjIndexReader.close(); - prjDire.close(); } catch (IOException e) { logger.error("prjToMemoMatch io error in Match: " + e); } @@ -263,13 +210,13 @@ public class Match { public void memoToPrjMatch(List memos) { HashMap matchMap = new HashMap(); try { - Directory dire = FSDirectory.open(Paths + Directory prjDire = FSDirectory.open(Paths .get(LuceneIndex.PRJS_INDEX_PATH)); - if (!DirectoryReader.indexExists(dire)) { // if the index directory + if (!DirectoryReader.indexExists(prjDire)) { // if the index directory // doesn't exist return; } - IndexReader indexReader = DirectoryReader.open(dire); + IndexReader prjIndexReader = DirectoryReader.open(prjDire); for(RelativeMemo memo : memos) { int memoId = 0; String memoTitle = ""; @@ -288,88 +235,48 @@ public class Match { if (Normalizer.isAllNumber(memoTitle)) { continue; } - //long start1 = System.currentTimeMillis(); List memoTitleList = Normalizer.getList(memoTitle); if (memoTitleList.size() > 0) { // 帖子标题匹配项目名 matchMap = LuceneSearch.memoToPrjMatchByLucene(memoTitle, memoTitleList, LuceneIndex.prjNameFieldName, - titleWeight, matchMap, indexReader); + titleWeight, matchMap, prjIndexReader); //帖子标题搜索项目别名 matchMap = LuceneSearch.memoToPrjMatchByLucene(memoTitle, memoTitleList, LuceneIndex.prjSynonymsFieldName, - titleWeight, matchMap, indexReader); + titleWeight, matchMap, prjIndexReader); } - //long end1 = System.currentTimeMillis(); - //logger.info("帖子标题匹配项目名: " + (end1 - start1) + "ms"); + List memoTagsList = Normalizer.tagsSegmentation(memo.getTags().toLowerCase()); // 用帖子标签搜索项目名 - if (memoTags.length() > 0) { - Set projectIds = resultPipeline.getPrjIdForMemo(memoId); // 这里要返回标签匹配到的项目id; - for (int pId : projectIds) { - if (matchMap.containsKey(pId)){ - matchMap.put(pId, matchMap.get(pId) + memoTagsWeight); - } - else { - matchMap.put(pId, memoTagsWeight); - } + if (memoTags.length() > 0 && memoTagsList.size() > 0) { + //用帖子标签搜索项目名 + for (String memoTag : memoTagsList) { + matchMap = LuceneSearch.searchMemoTags(memoTag, + LuceneIndex.prjNameFieldName, LuceneIndex.prjIdFieldName, memoTagsWeight, matchMap, prjIndexReader); + } + //用帖子标签搜索项目别名 + for (String tag : memoTagsList) { + matchMap = LuceneSearch.searchMemoTags(tag, + LuceneIndex.prjSynonymsFieldName, LuceneIndex.prjIdFieldName, memoTagsWeight, matchMap, prjIndexReader); } } - //long end2 = System.currentTimeMillis(); - //logger.info("帖子标签搜索项目名: " + (end2 - end1) + "ms"); if (matchMap.size() > 0) { // 帖子标题匹配项目标签 if (memoTitleList != null && memoTitleList.size() > 0) { for (String memoTitleTerm : memoTitleList) { matchMap = LuceneSearch.searchByPrjTag( memoTitleTerm, LuceneIndex.prjTagsFieldName, LuceneIndex.prjIdFieldName, prjTagsToMemoTitleWeight, - matchMap, indexReader); + matchMap, prjIndexReader); } } - //long end3 = System.currentTimeMillis(); - //logger.info("帖子标题匹配项目标签: " + (end3 - end2) + "ms"); - /*HashMap tagsCount = null; - tagsCount = LuceneSearch.tagsMatch( - LuceneIndex.prjIdFieldName, memoTitle, - LuceneIndex.prjTagsFieldName, indexReader); - // tagsCount是,x指匹配到prjTag的数量 - if (tagsCount.size() > 0) { - for (Map.Entry t : tagsCount - .entrySet()) { - int prjId = t.getKey(); - int x = t.getValue(); - if (matchMap.containsKey(prjId)) { - matchMap.put(prjId, matchMap.get(prjId) + matchMap.get(prjId) * 0.5 - * (Math.log(x * x + 1) / Math.log(2))); - } - } - } */ - // 用帖子标签搜索项目标签 if (memo.getTags() != null && memo.getTags().length() > 0) { - List memoTagsList = Normalizer.tagsSegmentation(memo.getTags().toLowerCase()); for (String memoTag : memoTagsList) { matchMap = LuceneSearch.searchByPrjTag( memoTag, LuceneIndex.prjTagsFieldName, LuceneIndex.prjIdFieldName, prjTagsToMemoTagsWeight, - matchMap, indexReader); + matchMap, prjIndexReader); } } - //long end4 = System.currentTimeMillis(); - //logger.info("帖子标签搜索项目标签: " + (end4 - end3) + "ms"); - /*if (memoTags.length() > 0) { - tagsCount = LuceneSearch.tagsMatch( - LuceneIndex.prjIdFieldName, memoTags, - LuceneIndex.prjTagsFieldName, indexReader); - if (tagsCount.size() > 0) { - for (Map.Entry t : tagsCount.entrySet()) { - int prjId = t.getKey(); - int y = t.getValue(); - if (matchMap.containsKey(prjId)) { - matchMap.put(prjId, matchMap.get(prjId) + matchMap.get(prjId) * 0.6 - * (Math.log(y * y + 1) / Math.log(2))); - } - } - } - }*/ if (matchMap.size() > 0) insertMemoToPrjMatchResult(memoId, matchMap); } @@ -377,8 +284,8 @@ public class Match { matchMap.clear(); logger.info("current MemoId: " + memoId); } - indexReader.close(); - dire.close(); + prjIndexReader.close(); + prjDire.close(); } catch (Exception e) { logger.error("memoToPrjMatch error in Match: " + e); } diff --git a/match_program/src/main/java/com/ossean/match/matchprocess/MatchIncrement.java b/match_program/src/main/java/com/ossean/match/matchprocess/MatchIncrement.java index f4b3a6b..c12e897 100644 --- a/match_program/src/main/java/com/ossean/match/matchprocess/MatchIncrement.java +++ b/match_program/src/main/java/com/ossean/match/matchprocess/MatchIncrement.java @@ -11,21 +11,11 @@ import org.slf4j.LoggerFactory; public class MatchIncrement extends Thread implements Runnable { private Logger logger = LoggerFactory.getLogger(getClass()); - private int prjHistory; // the record of project after matching last time private int memoHistory; // the record of memo after matching last time - private int prjIndexed; //the record of prj indexed to - private int memoIndexed; //the record of memo indexed to private static File record; // record file "record.txt" private boolean newPrjComing; //新项目进入标识 private boolean newMemoComing; //新帖子进入标识 private boolean currentMatchDone = true; //当前批量匹配是否结束 - private int lastPrjId = 0; //最大项目id - private int lastTagId = 0; //最大标签id - private int tagHistory = 0; //标签匹配记录 - private boolean newTagflag; //新标签进入标识 - private boolean newPrePrjFlag; //新的项目需要做标签匹配 - private int prePrjHistory = 0; //项目与标签匹配记录 - private int taskToPrjId = 0; private int sleepTime = 0; public boolean isNewPrjComing() { @@ -52,91 +42,11 @@ public class MatchIncrement extends Thread implements Runnable { this.sleepTime = sleepTime; } - public int getTaskToPrjId() { - return taskToPrjId; - } - - public void setTaskToPrjId(int taskToPrjId) { - this.taskToPrjId = taskToPrjId; - } - - public int getMemoIndexed() { - return memoIndexed; - } - - public void setMemoIndexed(int memoIndexed) { - this.memoIndexed = memoIndexed; - } - - public int getPrjIndexed() { - return prjIndexed; - } - - public void setPrjIndexed(int prjIndexed) { - this.prjIndexed = prjIndexed; - } - - public boolean isNewPrePrjFlag() { - return newPrePrjFlag; - } - - public void setNewPrePrjFlag(boolean newPrePrjFlag) { - this.newPrePrjFlag = newPrePrjFlag; - } - - public int getPrePrjHistory() { - return prePrjHistory; - } - - public void setPrePrjHistory(int prePrjHistory) { - this.prePrjHistory = prePrjHistory; - } - - public int getTagHistory() { - return tagHistory; - } - - public void setTagHistory(int tagHistory) { - this.tagHistory = tagHistory; - } - - public boolean isNewTagflag() { - return newTagflag; - } - - public void setNewTagflag(boolean newTagflag) { - this.newTagflag = newTagflag; - } - - public void setLastPrjId(int id) { - this.lastPrjId = id; - } - - public int getLastPrjId() { - return lastPrjId; - } - - public int getLastTagId() { - return lastTagId; - } - - public void setLastTagId(int id) { - this.lastTagId = id; - } - public MatchIncrement() { record = new File("record.txt"); getMatchRecord(); } - public int getPrjHistory() { - return prjHistory; - } - - public synchronized void setPrjHistory(int prjHistory) { - this.prjHistory = prjHistory; - } - public int getMemoHistory() { return memoHistory; } @@ -160,13 +70,7 @@ public class MatchIncrement extends Thread implements Runnable { public void getMatchRecord() { try { Scanner in = new Scanner(record); - this.setPrjHistory(in.nextInt()); this.setMemoHistory(in.nextInt()); - this.setTagHistory(in.nextInt()); - this.setPrePrjHistory(in.nextInt()); - this.setPrjIndexed(in.nextInt()); - this.setMemoIndexed(in.nextInt()); - this.setTaskToPrjId(in.nextInt()); in.close(); } catch (FileNotFoundException e) { logger.error("getMatchRecord FileNotFoundException: " + e); @@ -179,8 +83,7 @@ public class MatchIncrement extends Thread implements Runnable { public void writeIntoRecord() { try { FileOutputStream fout = new FileOutputStream(record); - fout.write((prjHistory + "\t" + memoHistory + "\t" + tagHistory - + "\t" + prePrjHistory + "\t" + prjIndexed + "\t" + memoIndexed + "\t" + taskToPrjId).getBytes()); + fout.write((memoHistory + "").getBytes()); fout.close(); } catch (FileNotFoundException e) { logger.error("writeIntoRecord FileNotFoundException: " + e); diff --git a/match_program/src/main/java/com/ossean/match/matchprocess/MemoToPrjMatch.java b/match_program/src/main/java/com/ossean/match/matchprocess/MemoToPrjMatch.java index f401aea..0fb8c70 100644 --- a/match_program/src/main/java/com/ossean/match/matchprocess/MemoToPrjMatch.java +++ b/match_program/src/main/java/com/ossean/match/matchprocess/MemoToPrjMatch.java @@ -2,7 +2,6 @@ package com.ossean.match.matchprocess; import java.io.IOException; import java.sql.SQLException; -import java.text.DecimalFormat; import java.util.List; import javax.annotation.Resource; @@ -41,7 +40,6 @@ public class MemoToPrjMatch implements Runnable { public void memoToPrjIncrement() throws SQLException, IOException { matchIncrement.setCurrentMatchDone(false); - long start = System.currentTimeMillis(); matchIncrement.getMatchRecord(); int startId = matchIncrement.getMemoHistory(); int endId = startId + step; @@ -49,25 +47,15 @@ public class MemoToPrjMatch implements Runnable { if ((lastMemoId - endId) <= step) { endId = lastMemoId; } + LuceneIndex luceneIndex = new LuceneIndex(); while (startId < lastMemoId) { - long batchStart = System.currentTimeMillis(); List memos = memoDao.getMemoInfo(startId, endId); if (memos != null && memos.size() > 0) { match.memoToPrjMatch(memos); matchIncrement.setMemoHistory(endId); - if (startId > matchIncrement.getMemoIndexed()) { - LuceneIndex luceneIndex = new LuceneIndex(); - int memoIndexed = luceneIndex.memoIndexIncrement(memos); - // adding memos index and save the record - matchIncrement.setMemoIndexed(memoIndexed); - } + // adding memos index and save the record + luceneIndex.memoIndexIncrement(memos); } - long batchEnd = System.currentTimeMillis(); - DecimalFormat df = new DecimalFormat( "0.00"); - double totalTime = (double)(batchEnd - batchStart)/60000; - double rate = (double)(endId - startId)/((batchEnd - batchStart)/1000); - logger.info("current Batch memo-->prj total time: " + df.format(totalTime) + "min; " - + " average: " + df.format(rate) + " 条/s"); logger.info("relative_memos_to_osps matched to : " + matchIncrement.getMemoHistory()); logger.info("relative_memos_to_osps remain_to_match : " @@ -83,13 +71,8 @@ public class MemoToPrjMatch implements Runnable { endId += step; } } - long end = System.currentTimeMillis(); - DecimalFormat df = new DecimalFormat( "0.00"); - double totalTime = (double)(end - start)/60000; - logger.info("memo-->prj total time: " + df.format(totalTime) + "min"); matchIncrement.setCurrentMatchDone(true); } - public void run() { if (matchIncrement.isNewMemoComing()) { logger.info("start relative_memos_to_osps match!!!"); diff --git a/match_program/src/main/java/com/ossean/match/matchprocess/NewPrjMonitor.java b/match_program/src/main/java/com/ossean/match/matchprocess/NewPrjMonitor.java index 6bb1f80..fa93912 100644 --- a/match_program/src/main/java/com/ossean/match/matchprocess/NewPrjMonitor.java +++ b/match_program/src/main/java/com/ossean/match/matchprocess/NewPrjMonitor.java @@ -26,13 +26,9 @@ public class NewPrjMonitor { public void run() { int newPrjCount = projectDao.getNewPrjCount(); - int lastPrjId = projectDao.getNewLast(); int lastMemoId = memoDao.getLastMemoId(); - int lastTagId = tagDao.getNewLast(); if (newPrjCount > 0) { matchIncrement.setNewPrjComing(true); - matchIncrement.setLastPrjId(lastPrjId); - } else { matchIncrement.setNewPrjComing(false); } @@ -42,19 +38,6 @@ public class NewPrjMonitor { matchIncrement.setNewMemoComing(false); } - if (newPrjCount > 0) { - matchIncrement.setNewPrePrjFlag(true); - } else { - matchIncrement.setNewPrePrjFlag(false); - } - - if (lastTagId > matchIncrement.getTagHistory()) { - matchIncrement.setNewTagflag(true); - matchIncrement.setLastTagId(lastTagId); - - } else { - matchIncrement.setNewTagflag(false); - } if (!matchIncrement.isNewMemoComing()&&!matchIncrement.isNewPrjComing()) { matchIncrement.setSleepTime(30000); }else { diff --git a/match_program/src/main/java/com/ossean/match/matchprocess/PrjToMemoMatch.java b/match_program/src/main/java/com/ossean/match/matchprocess/PrjToMemoMatch.java index 9249750..faae10e 100644 --- a/match_program/src/main/java/com/ossean/match/matchprocess/PrjToMemoMatch.java +++ b/match_program/src/main/java/com/ossean/match/matchprocess/PrjToMemoMatch.java @@ -2,7 +2,6 @@ package com.ossean.match.matchprocess; import java.io.IOException; import java.sql.SQLException; -import java.text.DecimalFormat; import java.util.List; import javax.annotation.Resource; @@ -34,73 +33,31 @@ public class PrjToMemoMatch implements Runnable { public void prjToMemoIncrement() throws IOException, SQLException { matchIncrement.setCurrentMatchDone(false); - long start = System.currentTimeMillis(); matchIncrement.getMatchRecord(); - List newPrjs = projectDao.getBatchPrjsIncre(step);// 批量读取filtration为1或者filtration为2且update_mark不为空的项目。即未匹配的和已匹配但有更新的。 - if(newPrjs == null || newPrjs.size() == 0){ - matchIncrement.setPrjHistory(matchIncrement.getLastPrjId()); - matchIncrement.writeIntoRecord(); - } + // 批量读取filtration为1或者filtration为2且update_mark不为空的项目。即未匹配的和已匹配但有更新的。 + List newPrjs = projectDao.getBatchPrjsIncre(step); while (newPrjs != null && newPrjs.size() > 0) { - long batchStart = System.currentTimeMillis(); - long end1 = System.currentTimeMillis(); - logger.info("批量读项目: " + (end1 - batchStart) + "ms"); if (newPrjs != null) { match.prjToMemoMatch(newPrjs); - matchIncrement.setPrjHistory(newPrjs.get(newPrjs.size()-1).getId()); - // adding projects index and save the record + // adding projects index LuceneIndex luceneIndex = new LuceneIndex(); - int prjIndexed = luceneIndex.prjIndexIncrement(newPrjs); - matchIncrement.setPrjIndexed(prjIndexed); + luceneIndex.prjIndexIncrement(newPrjs); } - long batchEnd = System.currentTimeMillis(); - DecimalFormat df = new DecimalFormat( "0.00"); - double totalTime = (double)(batchEnd - batchStart)/60000; - double rate = (double)(newPrjs.size())/((batchEnd - batchStart)/1000); - logger.info("current Batch prj-->memo total time: " + df.format(totalTime) + "min; " - + " average: " + df.format(rate) + " 条/s"); - // finishing matching, writing the record to "record.txt" - matchIncrement.writeIntoRecord(); - newPrjs = projectDao.getBatchPrjsIncre(step);// getting projects' id and prjName in batches + // getting projects' id and prjName in batches + newPrjs = projectDao.getBatchPrjsIncre(step); } - long end = System.currentTimeMillis(); - DecimalFormat df = new DecimalFormat( "0.00"); - double totalTime = (double)(end - start)/60000; - logger.info("prj-->memo total time: " + df.format(totalTime) + "min"); matchIncrement.setCurrentMatchDone(true); } public void run() { - if (matchIncrement.getTaskToPrjId() != 0) { - int startId = matchIncrement.getPrjHistory(); - int endId = matchIncrement.getTaskToPrjId(); - long batchStart = System.currentTimeMillis(); - List newPrjs = projectDao.getBatchPrjs(startId, endId);// getting projects' id and prjName in batches - long end1 = System.currentTimeMillis(); - logger.info("批量读项目: " + (end1 - batchStart) + "ms"); - if (newPrjs != null) { - match.prjToMemoMatch(newPrjs); - } - long batchEnd = System.currentTimeMillis(); - DecimalFormat df = new DecimalFormat( "0.00"); - double totalTime = (double)(batchEnd - batchStart)/60000; - double rate = (double)(endId - startId)/((batchEnd - batchStart)/1000); - logger.info("current Batch prj-->memo total time: " + df.format(totalTime) + "min; " - + " average: " + df.format(rate) + " 条/s"); - logger.info("osps_to_relative_memos matched to : " - + matchIncrement.getPrjHistory()); - System.exit(0); - } - else { - if (matchIncrement.isNewPrjComing()) { - logger.info("start osps_to_relative_memos match!!!"); - try { - prjToMemoIncrement(); - } catch (IOException e) { - logger.error("IOException: " + e); - } catch (SQLException e) { - logger.error("SQLException: " + e); - } + if (matchIncrement.isNewPrjComing()) { + logger.info("start osps_to_relative_memos match!!!"); + try { + prjToMemoIncrement(); + } catch (IOException e) { + logger.error("IOException: " + e); + } catch (SQLException e) { + logger.error("SQLException: " + e); } } diff --git a/match_program/src/main/java/com/ossean/match/model/Taggings.java b/match_program/src/main/java/com/ossean/match/model/Taggings.java deleted file mode 100644 index 583ed1d..0000000 --- a/match_program/src/main/java/com/ossean/match/model/Taggings.java +++ /dev/null @@ -1,68 +0,0 @@ -package com.ossean.match.model; - -public class Taggings { - private int id; - private int tag_id; - private int taggable_id; - private String taggable_type; - private int tagger_id; - private String tagger_type; - private String context; - private String created_at; - private int disagree_num; - public int getId() { - return id; - } - public void setId(int id) { - this.id = id; - } - public int getTag_id() { - return tag_id; - } - public void setTag_id(int tag_id) { - this.tag_id = tag_id; - } - public int getTaggable_id() { - return taggable_id; - } - public void setTaggable_id(int taggable_id) { - this.taggable_id = taggable_id; - } - public String getTaggable_type() { - return taggable_type; - } - public void setTaggable_type(String taggable_type) { - this.taggable_type = taggable_type; - } - public int getTagger_id() { - return tagger_id; - } - public void setTagger_id(int tagger_id) { - this.tagger_id = tagger_id; - } - public String getTagger_type() { - return tagger_type; - } - public void setTagger_type(String tagger_type) { - this.tagger_type = tagger_type; - } - public String getContext() { - return context; - } - public void setContext(String context) { - this.context = context; - } - public String getCreated_at() { - return created_at; - } - public void setCreated_at(String created_at) { - this.created_at = created_at; - } - public int getDisagree_num() { - return disagree_num; - } - public void setDisagree_num(int disagree_num) { - this.disagree_num = disagree_num; - } - -} diff --git a/match_program/src/main/java/com/ossean/match/pipeline/AtomicItemPipeline.java b/match_program/src/main/java/com/ossean/match/pipeline/AtomicItemPipeline.java deleted file mode 100644 index b17cddf..0000000 --- a/match_program/src/main/java/com/ossean/match/pipeline/AtomicItemPipeline.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.ossean.match.pipeline; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import javax.annotation.Resource; - -import org.springframework.stereotype.Component; -import org.springframework.transaction.annotation.Transactional; - -import com.ossean.match.dao.AtomicItemDao; -import com.ossean.match.model.Atom; - -@Component -public class AtomicItemPipeline { - - @Resource - private AtomicItemDao atomicItemDao; - - // 把原子性的 - @Transactional - public void pipelineAtoms(Map> atoms) { - // TODO Auto-generated method stub - Set atomNames = atoms.keySet(); - ArrayList nums; - for (String name : atomNames) { - nums = atoms.get(name); - atomicItemDao.insertAtomicItem(name, nums.get(0), nums.get(1)); - } - } - - @Transactional - // 0: 未更新 - // 1: 数据更新 - // 2: 数据插入 - public void pipelineAtoms(List atoms) { - // TODO Auto-generated method stub - for (Atom atom : atoms) { - int status = atom.getStatus(); - switch (status) { - case 0: - break; - case 1: - atomicItemDao.updateAtomicItem(atom.getName(), - atom.getNumTag(), atom.getNumInProj()); - break; - default: - atomicItemDao.insertAtomicItem(atom.getName(), - atom.getNumTag(), atom.getNumInProj()); - } - - } - } - -} diff --git a/match_program/src/main/java/com/ossean/match/pipeline/ResultPipeline.java b/match_program/src/main/java/com/ossean/match/pipeline/ResultPipeline.java deleted file mode 100644 index e1bd171..0000000 --- a/match_program/src/main/java/com/ossean/match/pipeline/ResultPipeline.java +++ /dev/null @@ -1,163 +0,0 @@ -package com.ossean.match.pipeline; - -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; - -import javax.annotation.Resource; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.stereotype.Component; -import org.springframework.transaction.annotation.Transactional; - -import com.ossean.match.dao.ResultDao; -import com.ossean.match.dao.TaggingDao; -import com.ossean.match.model.Project; - -@Component -public class ResultPipeline { - @Resource - private ResultDao resultDao; - @Resource - private TaggingDao taggingDao; - - private Logger logger = LoggerFactory.getLogger(getClass()); - - private LinkedList items1 = new LinkedList(); - private LinkedList items2 = new LinkedList(); - private LinkedList items3 = new LinkedList(); - private LinkedList projects = new LinkedList(); - - @Transactional - public void insertResult(int prjId, String string, int tagId, - String tagOriginalName, float score) { - - } - - @Transactional - public Set getPrjIdForMemo(int memoId) { - List tagIds = taggingDao.getTagIdForMemo(memoId); - Set result = new HashSet(); - for (int tagId : tagIds) { - List tempIntegers = resultDao.getMatchPrjId(tagId); - result.addAll(tempIntegers); - } - return result; - } - - public Set getTagbyPrjId(int prjId){ - List tags = resultDao.getMatchTag(prjId); - Set result = new HashSet(); - for (String tag : tags) { -// List tempIntegers = resultDao.getMatchPrjId(tagId); - result.add(tag); - } - return result; - } - - @Transactional - public void updateRank(Project project, int flag) { - if (flag == 1) { - projects.add(project); - if (projects.size() >= 10000) { - for (Project tproject : projects) { - System.out.println("handle project:" + tproject.getId()); - resultDao.updateRank(tproject.getRank(), - tproject.getStar(), tproject.getId()); - } - projects.clear(); - } - } else { - for (Project tproject : projects) { - System.out.println("handle project:" + tproject.getId()); - resultDao.updateRank(tproject.getRank(), tproject.getStar(), - tproject.getId()); - } - projects.clear(); - } - } - - @Transactional - public void insertResult1(int prjId, String prjName, int tagId, - String tagName, float score) { - items1.add(new Item(prjId, prjName, tagId, tagName, score)); - // 10000条输入数据库一次 - if (items1.size() >= 10000) { - for (Item item : items1) { - resultDao.insertResult1(item.getPrjId(), item.getPrjName(), - item.getTagId(), item.getTagName(), item.getScore()); - } - items1.clear(); - } - } - - @Transactional - public void insertResult2(int prjId, String prjName, int tagId, - String tagName, float score) { - items2.add(new Item(prjId, prjName, tagId, tagName, score)); - // 10000条输入数据库一次 - if (items2.size() >= 10000) { - for (Item item : items2) { - resultDao.insertResult2(item.getPrjId(), item.getPrjName(), - item.getTagId(), item.getTagName(), item.getScore()); - } - items2.clear(); - } - } - - @Transactional - public void insertResult3(int prjId, String prjName, int tagId, - String tagName, float score) { - items3.add(new Item(prjId, prjName, tagId, tagName, score)); - // 10000条输入数据库一次 - if (items3.size() >= 10000) { - logger.info("match tag and project :" + prjId); - for (Item item : items3) { - resultDao.insertResult3(item.getPrjId(), item.getPrjName(), - item.getTagId(), item.getTagName(), item.getScore()); - } - items3.clear(); - } - } -} - -class Item { - - public Item(int prjId, String prjName, int tagId, String tagName, - float score) { - super(); - this.tagName = tagName; - this.prjId = prjId; - this.prjName = prjName; - this.tagId = tagId; - this.score = score; - } - - public int getPrjId() { - return prjId; - } - - public String getPrjName() { - return prjName; - } - - public int getTagId() { - return tagId; - } - - public float getScore() { - return score; - } - - public String getTagName() { - return tagName; - } - - int prjId; - String prjName; - int tagId; - String tagName; - float score; -} diff --git a/match_program/src/main/resources/applicationContext_mybatis.xml b/match_program/src/main/resources/applicationContext_mybatis.xml index bfc2d83..f6deb8f 100644 --- a/match_program/src/main/resources/applicationContext_mybatis.xml +++ b/match_program/src/main/resources/applicationContext_mybatis.xml @@ -38,7 +38,7 @@ - + diff --git a/project_manager/bin/hotwords.sh b/project_manager/bin/hotwords.sh index 2c3acfe..3b26e22 100644 --- a/project_manager/bin/hotwords.sh +++ b/project_manager/bin/hotwords.sh @@ -16,4 +16,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.HotwordsMain >>log/hotwords.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.HotwordsMain > log/hotwords.log 2>&1 & \ No newline at end of file diff --git a/project_manager/bin/projectsfilter.sh b/project_manager/bin/projectsfilter.sh index 6a7900b..eb7daaa 100644 --- a/project_manager/bin/projectsfilter.sh +++ b/project_manager/bin/projectsfilter.sh @@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx1024m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxT #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.ProjectsFilterMain >>log/projectsfilter.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.ProjectsFilterMain > log/projectsfilter.log 2>&1 & \ No newline at end of file diff --git a/project_manager/bin/resources/applicationContext_mybatis.xml b/project_manager/bin/resources/applicationContext_mybatis.xml index 7935118..6c5596e 100644 --- a/project_manager/bin/resources/applicationContext_mybatis.xml +++ b/project_manager/bin/resources/applicationContext_mybatis.xml @@ -39,9 +39,9 @@ destroy-method="close"> + value="jdbc:mysql://172.16.128.36:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" /> - + diff --git a/project_manager/bin/syncmatchresult.sh b/project_manager/bin/syncmatchresult.sh index fc917d6..c41c0f3 100644 --- a/project_manager/bin/syncmatchresult.sh +++ b/project_manager/bin/syncmatchresult.sh @@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx1024m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxT #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.SyncMatchResultMain >>log/syncmatchresult.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.SyncMatchResultMain > log/syncmatchresult.log 2>&1 & \ No newline at end of file diff --git a/project_manager/bin/updateosptags.sh b/project_manager/bin/updateosptags.sh index fdde324..2289141 100644 --- a/project_manager/bin/updateosptags.sh +++ b/project_manager/bin/updateosptags.sh @@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe #echo JVM_ARGS=$JVM_ARGS #ulimit -n 400000 #echo "" > nohup.out -java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.UpdateOspTagsMain >>log/updateosptags.log 2>&1 & \ No newline at end of file +java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.UpdateOspTagsMain > log/updateosptags.log 2>&1 & \ No newline at end of file diff --git a/project_manager/src/main/java/com/ossean/projectmanager/UpdateOspTagsMain.java b/project_manager/src/main/java/com/ossean/projectmanager/UpdateOspTagsMain.java index 9eadf04..2551d6a 100644 --- a/project_manager/src/main/java/com/ossean/projectmanager/UpdateOspTagsMain.java +++ b/project_manager/src/main/java/com/ossean/projectmanager/UpdateOspTagsMain.java @@ -16,7 +16,8 @@ public class UpdateOspTagsMain { private UpdateOspTags updateTags; public void start(){ - updateTags.updateOspTags(); +// updateTags.updateOspTags(); + updateTags.restorePrjTaggings(); } public static void main(String[] args){ diff --git a/project_manager/src/main/java/com/ossean/projectmanager/hotwords/UpdateOspTags.java b/project_manager/src/main/java/com/ossean/projectmanager/hotwords/UpdateOspTags.java index 8382421..c6d38ef 100644 --- a/project_manager/src/main/java/com/ossean/projectmanager/hotwords/UpdateOspTags.java +++ b/project_manager/src/main/java/com/ossean/projectmanager/hotwords/UpdateOspTags.java @@ -13,6 +13,7 @@ import com.ossean.projectmanager.lasttabledao.TagDao; import com.ossean.projectmanager.lasttabledao.TaggingsDao; import com.ossean.projectmanager.model.OpenSourceProject; import com.ossean.projectmanager.model.Taggings; +import com.ossean.projectmanager.utils.DataHandler; /** * @@ -28,6 +29,9 @@ public class UpdateOspTags { private TagDao tagDao; @Resource private TaggingsDao taggingsDao; + + private String taggingsTableName = "taggings"; + private String tagsTableName = "tags"; /** * 根据taggings更新项目标签字段tags和权重更高的标签字段tags_for_search @@ -67,5 +71,50 @@ public class UpdateOspTags { } } + + public void restorePrjTaggings(){ + logger.info("start updateOspTags ......"); + int start = 0; + int prjId = 0; + while (start < 2000000) { + List ospList = ospDao.getProjectsByBatch(start, + 5000); + for (OpenSourceProject osp : ospList) { + prjId = osp.getId(); + // 将tag和项目的关系存入表item_tag_relation 并分离tag + String tags = osp.getTags(); + if (tags == null) { + // 表示该项目没有标签 + continue; + } + List tagList = DataHandler + .tagsSegmentation(tags); + for (String tag : tagList) { + ospDao.insertTag(tagsTableName, tag);// ignore方式插入该项目的标签 + int tag_id = ospDao.selectTagIdByName( + tagsTableName, tag); + Taggings taggings = new Taggings(); + taggings.setTag_id(tag_id); + taggings.setTaggable_id(prjId); + taggings.setTaggable_type("OpenSourceProject"); + taggings.setContext("tags"); + taggings.setCreated_at(DataHandler.getNow()); + // 将Taggings对象存入数据库中 + try { + ospDao.insertTaggings(taggingsTableName, + taggings); + } catch (Exception e) { + // 在插入记录之前 relative_memos表中的记录已经被删除掉了 + logger.error("insert taggins error: " + e); + System.exit(0); + } + } + logger.info("currentPrjId: " + prjId); + } + logger.info("last prj batch end, currentPrjId: " + prjId); + start = prjId + 5000; + } + + } } diff --git a/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java b/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java index fd70bad..988c9af 100644 --- a/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java +++ b/project_manager/src/main/java/com/ossean/projectmanager/lasttabledao/OpenSourceProjectDao.java @@ -2,15 +2,32 @@ package com.ossean.projectmanager.lasttabledao; import java.util.List; +import org.apache.ibatis.annotations.Insert; import org.apache.ibatis.annotations.Param; import org.apache.ibatis.annotations.Select; import org.apache.ibatis.annotations.Update; import com.ossean.projectmanager.model.OpenSourceProject; +import com.ossean.projectmanager.model.Taggings; public interface OpenSourceProjectDao { - - //获得最大项目id + + // 向tag表存储数据 + @Insert("insert ignore into ${table} (name) values (#{name})") + public void insertTag(@Param("table") String table, + @Param("name") String name); + + // 根据tag的name找tag的id + @Select("select id from ${table} where name=#{name}") + public int selectTagIdByName(@Param("table") String table, + @Param("name") String name); + + // 插入Taggings对象 + @Insert("insert ignore into ${table} (tag_id,taggable_id,taggable_type,context,created_at,disagree_num) values (#{model.tag_id},#{model.taggable_id},#{model.taggable_type},#{model.context},#{model.created_at},#{model.disagree_num})") + public void insertTaggings(@Param("table") String table, + @Param("model") Taggings model); + + // 获得最大项目id @Select("select MAX(id) from open_source_projects") public Integer getNewLast(); @@ -19,10 +36,9 @@ public interface OpenSourceProjectDao { public List getProjectsByBatch( @Param("start") int start, @Param("size") int size); - //更新项目标签字段tags和权重更高的标签字段tags_for_search + // 更新项目标签字段tags和权重更高的标签字段tags_for_search @Update("update open_source_projects set tags=#{tags}, tags_for_search = #{tagsForSearch} where id=#{id}") - public void updatePrjTags(@Param("id") int id, - @Param("tags") String tags, + public void updatePrjTags(@Param("id") int id, @Param("tags") String tags, @Param("tagsForSearch") String tagsForSearch); // 对项目标签属性进行更新 diff --git a/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java b/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java index 28e6c52..05df7cc 100644 --- a/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java +++ b/project_manager/src/main/java/com/ossean/projectmanager/projectsfilter/ProjectsFilter.java @@ -68,9 +68,9 @@ public class ProjectsFilter { .getOpenHubPrjByUrl(url); // 根据url从openhub的项目分表获得项目信息 if (openhubProject != null && openhubProject.getName() != null - && openhubProject.getName() != "" + && !"".equals(openhubProject.getName()) && openhubProject.getDescription() != null - && openhubProject.getDescription() != "" + && !"".equals(openhubProject.getDescription()) && openhubProject.getCodeRepository() != null && !openhubProject.getCodeRepository().contains( "add a code location")) { // openhub的筛选条件为name、description不为空,且该项目有版本库 @@ -80,10 +80,11 @@ public class ProjectsFilter { matchResultDao.deleteMatchResult( getTargetTable(project.getId()), project.getId()); // 删除该项目的匹配结果,确保无之前的匹配结果 - } else { - lastProjectDao.updateFiltratedPrj(project.getId(), - 2); // 筛选标识由1或2变为2,表示该项目之前就是筛选作为保留的 - } + } +// else { +// lastProjectDao.updateFiltratedPrj(project.getId(), +// 2); // 筛选标识由1或2变为2,表示该项目之前就是筛选作为保留的 +// } } else { lastProjectDao.updateFiltratedPrj(project.getId(), 0); // 筛选标识变为0,表示该项目不保留 matchResultDao.deleteMatchResult( @@ -95,9 +96,9 @@ public class ProjectsFilter { .getSourceForgePrjByUrl(url); // 根据url从SourceForge的项目分表获得项目信息 if (sourceforgeProject != null && sourceforgeProject.getName() != null - && sourceforgeProject.getName() != "" + && !"".equals(sourceforgeProject.getName()) && sourceforgeProject.getDescription() != null - && sourceforgeProject.getDescription() != "" + && !"".equals(sourceforgeProject.getDescription()) && ((sourceforgeProject.getDownload_num() > 0) || (sourceforgeProject .getStars() > 0))) { if (project.getFilration() == 0) { @@ -106,10 +107,11 @@ public class ProjectsFilter { matchResultDao.deleteMatchResult( getTargetTable(project.getId()), project.getId()); // 删除该项目的匹配结果,确保无之前的匹配结果 - } else { - lastProjectDao.updateFiltratedPrj(project.getId(), - 2); // 筛选标识由1或2变为2,表示该项目之前就是筛选作为保留的 - } + } +// else { +// lastProjectDao.updateFiltratedPrj(project.getId(), +// 2); // 筛选标识由1或2变为2,表示该项目之前就是筛选作为保留的 +// } } else { lastProjectDao.updateFiltratedPrj(project.getId(), 0); // 筛选标识变为0,表示该项目不保留 matchResultDao.deleteMatchResult( @@ -122,9 +124,10 @@ public class ProjectsFilter { matchResultDao.deleteMatchResult( getTargetTable(project.getId()), project.getId()); // 删除该项目的匹配结果,确保无之前的匹配结果 - } else { - lastProjectDao.updateFiltratedPrj(project.getId(), 2); // 筛选标识由1或2变为2,表示该项目之前就是筛选作为保留的 - } + } +// else { +// lastProjectDao.updateFiltratedPrj(project.getId(), 2); // 筛选标识由1或2变为2,表示该项目之前就是筛选作为保留的 +// } } else { logger.info("Unknown source... source = " + source); } diff --git a/project_manager/src/main/java/com/ossean/projectmanager/utils/DataHandler.java b/project_manager/src/main/java/com/ossean/projectmanager/utils/DataHandler.java new file mode 100644 index 0000000..be040fc --- /dev/null +++ b/project_manager/src/main/java/com/ossean/projectmanager/utils/DataHandler.java @@ -0,0 +1,38 @@ +package com.ossean.projectmanager.utils; + +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class DataHandler { + + // 获取当前时间的String + public static String getNow() { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + String result = sdf.format(new Date()); + return result; + } + + // 标签分离函数 + public static List tagsSegmentation(String tags) { + List tag = new ArrayList(); + + if (tags != null) { + String regex = "<[^<>]*>"; + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(tags); + + while (matcher.find()) { + String t = matcher.group(); + t = t.substring(1, t.length() - 1); + + tag.add(t); + } + } + return tag; + } + +} diff --git a/project_match/bin/resources/applicationContext_mybatis.xml b/project_match/bin/resources/applicationContext_mybatis.xml index 6737896..c698761 100644 --- a/project_match/bin/resources/applicationContext_mybatis.xml +++ b/project_match/bin/resources/applicationContext_mybatis.xml @@ -19,9 +19,9 @@ destroy-method="close"> + value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull&autoReconnect=true" /> - + diff --git a/project_match/pom.xml b/project_match/pom.xml index 699080b..e9148f4 100644 --- a/project_match/pom.xml +++ b/project_match/pom.xml @@ -46,7 +46,7 @@ junit junit - 3.8.1 + 4.10 test diff --git a/project_match/src/main/java/com/ossean/ClearTable.java b/project_match/src/main/java/com/ossean/ClearTable.java index 26785a9..b98ed69 100644 --- a/project_match/src/main/java/com/ossean/ClearTable.java +++ b/project_match/src/main/java/com/ossean/ClearTable.java @@ -16,15 +16,35 @@ public class ClearTable { Logger logger = Logger.getLogger(ClearTable.class); public void start(){ - truncateTable("edd_pointers"); + truncateTable("synonyms"); truncateTable("edd_relations"); truncateTable("synonymmings"); truncateTable("open_source_projects"); - deleteTaggings4Project(); + deleteItemInEddPointers("gather_projects", "edd_relations"); + //deleteTaggings4Project(); logger.info("完成去重程序运行前数据表的清空和taggings表OpenSourceProject的删除"); } - + //删除edd_pointers表中transfer对应的数据 + public void deleteItemInEddPointers(String sourceTableName, String targetTableName){ + String sql = "delete from edd_pointers where SourceTableName=? and TargetTableName=?"; + Connection conn = getConnection(); + if(conn == null){ + logger.info("没有获取到Connection"); + System.exit(-1); + } + try { + PreparedStatement ps = conn.prepareStatement(sql); + ps.setString(1, sourceTableName); + ps.setString(2, targetTableName); + ps.execute(); + ps.close(); + conn.close(); + } catch (SQLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } public void truncateTable(String table){ String sql = "truncate table " + table; Connection conn = getConnection(); @@ -65,7 +85,7 @@ public class ClearTable { Connection conn = null; try { Class.forName("com.mysql.jdbc.Driver").newInstance(); - conn = DriverManager.getConnection("jdbc:mysql://172.16.128.30:3306/ossean", "root", "123456"); + conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/ossean_production", "root", "NUDTpdl@123"); } catch (InstantiationException e) { e.printStackTrace(); } catch (IllegalAccessException e) { diff --git a/project_match/src/main/java/com/ossean/MergeProjects.java b/project_match/src/main/java/com/ossean/MergeProjects.java index bb19c09..a752b7d 100644 --- a/project_match/src/main/java/com/ossean/MergeProjects.java +++ b/project_match/src/main/java/com/ossean/MergeProjects.java @@ -1,98 +1,113 @@ -package com.ossean; - -import java.util.List; - -import javax.annotation.Resource; - -import org.apache.log4j.Logger; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.context.ApplicationContext; -import org.springframework.context.support.ClassPathXmlApplicationContext; -import org.springframework.stereotype.Component; - -import com.ossean.databaseSource.DBSource; -import com.ossean.databaseSource.GatherDao; -import com.ossean.databaseSource.UpdateControlProjectsDao; -import com.ossean.model.GatherProjectsModel; -import com.ossean.util.MergeProjectNew; -import com.ossean.util.MergeProjectsUtil; - -@Component -public class MergeProjects { - Logger logger = Logger.getLogger(this.getClass()); - @Resource - private DBSource dbSource; - @Resource - private GatherDao gatherDao; - @Resource - private UpdateControlProjectsDao updateControlDao; - - @Qualifier("mergeProjectNew") - @Autowired - private MergeProjectNew mergeProjectNew; - - private int startId1; - - private static String pointerTableName = "edd_pointers"; - private static String sourceTableName = "gather_projects"; - private static String targetTableName = "edd_relations"; - - private static int batchSize = 500; - - //读指针 - public int readPointer(String table, String source, String target, int minId){ - int pointer = minId; - try { - pointer = dbSource.getPointer(table, source, target); - } catch(Exception e) { - logger.info("No such pointer! Create one"); - dbSource.insertPointer(table, source, target, pointer); - } - return pointer; - } - - public void start(){ - int minId = 1; - int maxId = dbSource.getMaxId(sourceTableName); - - while(true){ - startId1 = readPointer(pointerTableName, sourceTableName, targetTableName, minId); - if(startId1 > maxId){ - logger.info("finish program! HAHAHA"); - System.exit(0); - } - List gpmList1 = gatherDao.selectGPMList(sourceTableName, startId1, batchSize, maxId); - if(gpmList1.size() == 0){ - try { - logger.info("no increase item to be handled! handle update items"); - List updateProjectList = gatherDao.selectUpdateProjects(sourceTableName); - for(GatherProjectsModel model:updateProjectList){ - mergeProjectNew.handleNewProject(model, true); - gatherDao.updateUpdateMark(sourceTableName, null, model.getId()); - } - logger.info("all update items have been solved. Sleep 1h"); - Thread.sleep(3600*1000); - continue; - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - for(GatherProjectsModel model1:gpmList1){ - logger.info("handling project : " + model1.getId()); - mergeProjectNew.handleNewProject(model1, false); - - //如果是对更新表中的数据进行操作 则不更新指针 - dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, model1.getId() + 1); - gatherDao.updateUpdateMark(sourceTableName, null, model1.getId()); //需要更新update_mark字段 因为按顺序取的时候可能取到1的 - } - } - } - - - public static void main(String[] args){ - ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml"); - MergeProjects Main = applicationContext.getBean(MergeProjects.class); - Main.start(); - } -} +package com.ossean; + +import java.util.List; + +import javax.annotation.Resource; + +import org.apache.log4j.Logger; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.context.ApplicationContext; +import org.springframework.context.support.ClassPathXmlApplicationContext; +import org.springframework.stereotype.Component; + +import com.ossean.databaseSource.DBSource; +import com.ossean.databaseSource.GatherDao; +import com.ossean.databaseSource.UpdateControlProjectsDao; +import com.ossean.model.GatherProjectsModel; +import com.ossean.util.MergeProjectNew; +import com.ossean.util.MergeProjectsUtil; + +@Component +public class MergeProjects { + Logger logger = Logger.getLogger(this.getClass()); + @Resource + private DBSource dbSource; + @Resource + private GatherDao gatherDao; + @Resource + private UpdateControlProjectsDao updateControlDao; + + @Qualifier("mergeProjectNew") + @Autowired + private MergeProjectNew mergeProjectNew; + + private int startId1; + + private static String pointerTableName = "edd_pointers"; + private static String sourceTableName = "gather_projects"; + private static String targetTableName = "edd_relations"; + + private static int batchSize = 500; + + //读指针 + public int readPointer(String table, String source, String target, int minId){ + int pointer = minId; + try { + pointer = dbSource.getPointer(table, source, target); + } catch(Exception e) { + logger.info("No such pointer! Create one"); + dbSource.insertPointer(table, source, target, pointer); + } + return pointer; + } + + public void start(){ + //int minId = 1; + //int maxId = dbSource.getMaxId(sourceTableName); + int count=0; + count = readPointer(pointerTableName, sourceTableName, targetTableName, count); + + while(true){ + //startId1 = readPointer(pointerTableName, sourceTableName, targetTableName, minId); +// if(startId1 > maxId){ +// +// try { +// logger.info("没有数据了,休眠10min"); +// Thread.sleep(600*1000L); +// continue; +// } catch (InterruptedException e) { +// e.printStackTrace(); +// } +// } + List gpmList1 = gatherDao.selectGPMList(sourceTableName,batchSize); + + if(gpmList1.size() == 0){ + try { + logger.info("no increase item to be handled! handle update items"); + List updateProjectList = gatherDao.selectUpdateProjects(sourceTableName,batchSize); + if(updateProjectList.size() == 0){ + logger.info("all update items have been solved. Sleep 10 mins"); + Thread.sleep(600*1000); + continue; + } + else + for(GatherProjectsModel model:updateProjectList){ + mergeProjectNew.handleNewProject(model, true); + gatherDao.updateUpdateMark(sourceTableName, 1, model.getId()); + } + + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + long start = System.currentTimeMillis(); + for(GatherProjectsModel model1:gpmList1){ + logger.info("handling project : " + model1.getId()); + mergeProjectNew.handleNewProject(model1, false); + count = count+1; + dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, count); + gatherDao.updateUpdateMark(sourceTableName, 1, model1.getId()); //需要更新update_mark字段 因为按顺序取的时候可能取到1的 + } + long end = System.currentTimeMillis(); + logger.warn("deal with: 500 projects cost: "+(float)(end - start)/60000+" minutes"); + } + } + + + public static void main(String[] args){ + ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml"); + MergeProjects Main = applicationContext.getBean(MergeProjects.class); + Main.start(); + } +} diff --git a/project_match/src/main/java/com/ossean/MergeProjectsOld.java b/project_match/src/main/java/com/ossean/MergeProjectsOld.java index a78290f..2454c60 100644 --- a/project_match/src/main/java/com/ossean/MergeProjectsOld.java +++ b/project_match/src/main/java/com/ossean/MergeProjectsOld.java @@ -73,7 +73,7 @@ public class MergeProjectsOld { }else{} while(true){ startId1 = readPointer(pointerTableName, sourceTableName, edd_relations_pointer, minId); - List gpmList1 = gatherDao.selectGPMList(sourceTableName, startId1, batchSize, maxId); + List gpmList1 = gatherDao.selectGPMList(sourceTableName, batchSize); if(gpmList1.size() == 0){ try { logger.info("no increase item to be handled! handle update items"); @@ -92,7 +92,7 @@ public class MergeProjectsOld { List updateProjectList = gatherDao.selectUpdateProjects(sourceTableName); for(GatherProjectsModel model:updateProjectList){ mergeProjectNew.handleNewProject(model, true); - gatherDao.updateUpdateMark(sourceTableName, null, model.getId()); + gatherDao.updateUpdateMark(sourceTableName, 1, model.getId()); } logger.info("all update items have been solved. Sleep 1h"); Thread.sleep(3600*1000); @@ -110,7 +110,7 @@ public class MergeProjectsOld { //如果是对更新表中的数据进行操作 则不更新指针 dbSource.updatePointer(pointerTableName, sourceTableName, edd_relations_pointer, model1.getId() + 1); - gatherDao.updateUpdateMark(sourceTableName, null, model1.getId()); //需要更新update_mark字段 因为按顺序取的时候可能取到1的 + gatherDao.updateUpdateMark(sourceTableName, 1, model1.getId()); //需要更新update_mark字段 因为按顺序取的时候可能取到1的 } } } diff --git a/project_match/src/main/java/com/ossean/databaseDest/DBDest.java b/project_match/src/main/java/com/ossean/databaseDest/DBDest.java index 992fdce..d4fa82b 100644 --- a/project_match/src/main/java/com/ossean/databaseDest/DBDest.java +++ b/project_match/src/main/java/com/ossean/databaseDest/DBDest.java @@ -26,13 +26,13 @@ public interface DBDest { */ //向open_source_projects表中插入对象数据 @Insert("insert into ${table} (id,name,description," - + "url,url_md5,language,category," + + "url,language,category," + "source,created_time,updated_time," - + "tags,tags_for_search,synonyms,license,homepage,extracted_time,update_mark) values (#{model.id},#{model.name},#{model.description}," - + "#{model.url},#{model.url_md5},#{model.language}," + + "tags,tags_for_search,synonyms,update_mark) values (#{model.id},#{model.name},#{model.description}," + + "#{model.url},,#{model.language}," + "#{model.category},#{model.source}," + "#{model.created_time},#{model.updated_time},#{model.tags}," - + "#{model.tags_for_search},#{model.synonyms},#{model.license},#{model.homepage},#{model.extracted_time},#{model.update_mark})") + + "#{model.tags_for_search},#{model.synonyms},#{model.update_mark})") public void insertOsp(@Param("table") String table, @Param("model") OpenSourceProject model); //查找刚刚插入open_source_projects表中的记录id diff --git a/project_match/src/main/java/com/ossean/databaseSource/GatherDao.java b/project_match/src/main/java/com/ossean/databaseSource/GatherDao.java index 7cf9df5..6868e83 100644 --- a/project_match/src/main/java/com/ossean/databaseSource/GatherDao.java +++ b/project_match/src/main/java/com/ossean/databaseSource/GatherDao.java @@ -87,8 +87,10 @@ public interface GatherDao { //根据id和limit取GatherProjectsModel列表 - @Select("select * from ${table} where id>=#{start} and id<=#{maxId} order by id asc limit #{size}") - public List selectGPMList(@Param("table") String table, @Param("start") int start, @Param("size") int size, @Param("maxId") int maxId); + //@Select("select * from ${table} where id>=#{start} and id<=#{maxId} order by id asc limit #{size}") + //public List selectGPMList(@Param("table") String table, @Param("start") int start, @Param("size") int size, @Param("maxId") int maxId); + @Select("select * from ${table} where update_mark = 0 limit #{size}") + public List selectGPMList(@Param("table") String table, @Param("size") int size); @Select("select * from ${table} where id>=#{start} and id<#{end} limit #{size}") public List selectGPMListBySize(@Param("table") String table, @Param("start") int start, @Param("end") int end, @Param("size") int size); @@ -106,12 +108,12 @@ public interface GatherDao { public List selectGPMBySameName(@Param("table") String table, @Param("name") String name, @Param("id") int id); - //取出所有update_mark为1的更新数据 - @Select("select * from ${table} where update_mark=1") - public List selectUpdateProjects(@Param("table") String table); + //取出所有update_mark为2的更新数据 + @Select("select * from ${table} where update_mark=2 limit #{size}") + public List selectUpdateProjects(@Param("table") String table,@Param("size") int size); //更新update_mark字段 @Select("update ${table} set update_mark=#{update_mark} where id=#{id}") - public void updateUpdateMark(@Param("table") String table, @Param("update_mark") String update_mark, @Param("id") int id); + public void updateUpdateMark(@Param("table") String table, @Param("update_mark") int update_mark, @Param("id") int id); } \ No newline at end of file diff --git a/project_match/src/main/java/com/ossean/util/StringHandler.java b/project_match/src/main/java/com/ossean/util/StringHandler.java index 54d195f..3411d34 100644 --- a/project_match/src/main/java/com/ossean/util/StringHandler.java +++ b/project_match/src/main/java/com/ossean/util/StringHandler.java @@ -45,7 +45,7 @@ public class StringHandler { } } resultStrArr = tmp.toArray(new String[0]); -System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]); +//System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]); return resultStrArr; } diff --git a/project_match/src/main/resource/log4j.xml b/project_match/src/main/resource/log4j.xml index dbbef90..a368b6c 100644 --- a/project_match/src/main/resource/log4j.xml +++ b/project_match/src/main/resource/log4j.xml @@ -23,6 +23,16 @@ + + + + + + + + + + @@ -47,13 +57,13 @@ - - + + \ No newline at end of file diff --git a/project_match/src/test/java/foo/Testcase.java b/project_match/src/test/java/foo/Testcase.java deleted file mode 100644 index a065f73..0000000 --- a/project_match/src/test/java/foo/Testcase.java +++ /dev/null @@ -1,126 +0,0 @@ -package foo; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import javax.annotation.Resource; - -import org.apache.log4j.Logger; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.beans.factory.annotation.Qualifier; -import org.springframework.context.ApplicationContext; -import org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; - -import com.ossean.databaseSource.DBSource; -import com.ossean.databaseSource.GatherDao; -import com.ossean.model.GatherProjectsModel; -import com.ossean.util.MergeProjectNew; -import com.ossean.util.MergeProjectsUtil; -import com.ossean.util.RegexHandler; -import com.ossean.util.StringHandler_ProjectName; - -@RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = {"classpath*:/applicationContext*.xml"}) -public class Testcase { - - @Autowired - ApplicationContext ctx; - - Logger logger = Logger.getLogger(this.getClass()); - @Resource - private DBSource dbSource; - @Resource - private GatherDao gatherDao; - - @Qualifier("mergeProjectsUtil") - @Autowired - private MergeProjectsUtil mergeProjectsUtil; - - @Qualifier("mergeProjectNew") - @Autowired - private MergeProjectNew mergeProjectNew; - - - public static void main(String[] args) { - - - } - - @Test - public void testGetSynonyms(){ - List prj_list = gatherDao.selectGPMList("gather_projects_test", 11000, 100, 1135058); - GatherProjectsModel gpm = gatherDao.selectGPMById("gather_projects", 11085);//1220,1995,1194,11085 - GatherProjectsModel gpm2 = gatherDao.selectGPMById("gather_projects", 237094); - List list= mergeProjectNew.getSynonyms(gpm); - String str = ""; - for (int j = 0;j list= mergeProjectNew.getSynonyms(prj_list.get(i)); -// -// String str = ""; -// for (int j = 0;j synonymList = mergeProjectsUtil.getSynonyms(gpm_2); -// for (int i =0;i tagsSegmentation(String tags) { - List tag = new ArrayList(); - - if (tags != null) { - String regex = "<[^<>]*>"; - //一个Pattern是一个正则表达式经编译后的表现模式。 - Pattern pattern = Pattern.compile(regex); - //一个Matcher对象是一个状态机器,它依据Pattern对象做为匹配模式对字符串展开匹配检查。 - Matcher matcher = pattern.matcher(tags); - - while (matcher.find()) { - String t = matcher.group(); - t = t.substring(1, t.length() - 1); - - tag.add(t); - } - } - return tag; - } - -} diff --git a/trustie2/Gemfile b/trustie2/Gemfile index babf8a2..61f1793 100644 --- a/trustie2/Gemfile +++ b/trustie2/Gemfile @@ -1,5 +1,10 @@ -source 'http://ruby.taobao.org/' +<<<<<<< HEAD +#source 'http://ruby.taobao.org/' +source 'https://rubygems.org' +======= +source 'https://ruby.taobao.org/' #source 'https://rubygems.org' +>>>>>>> tmp unless RUBY_PLATFORM =~ /w32/ # unix-like only diff --git a/trustie2/app/controllers/open_source_projects_controller.rb b/trustie2/app/controllers/open_source_projects_controller.rb index 8b95594..486f8aa 100644 --- a/trustie2/app/controllers/open_source_projects_controller.rb +++ b/trustie2/app/controllers/open_source_projects_controller.rb @@ -60,11 +60,11 @@ class OpenSourceProjectsController < ApplicationController #@open_source_project.update_column(:view_num_ossean, (@open_source_project.view_num_ossean.to_i+1)) #@open_source_project.save - @pops=@open_source_project.open_source_project_popularities.where("year_col >= 2013").order("year_col ASC, month_col ASC") - @hot_words=@open_source_project.hot_words + # @pops=@open_source_project.open_source_project_popularities.where("year_col >= 2013").order("year_col ASC, month_col ASC") + # @hot_words=@open_source_project.hot_words @memo = RelativeMemo.new(:open_source_project => @open_source_project) @topics_count = @open_source_project.relative_memos_num - @bugs = @open_source_project.relative_bugs.limit(6) + # @bugs = @open_source_project.relative_bugs.limit(6) ###########推荐############################################################# @simility_osp_array ,@simility_id_array,@simility_weight_array ,@simility_osp_des= recommend(params[:id],"simility") @@ -205,6 +205,32 @@ class OpenSourceProjectsController < ApplicationController language = '' language << @language #search_words << ' ' + @language if @language + + # 请求starlee服务 + require 'net/http' + + star_projects = [] + parse_string = "http://172.16.128.35:5000/correlation_search?q=" + @name + + # parse_string = "http://172.20.10.3:5000/correlation_search?q=" + @name + parse_string.gsub! " ","%20" + url = URI.parse(parse_string) + req = Net::HTTP::Get.new(url.to_s) + res = Net::HTTP.start(url.host, url.port) {|http| + http.request(req) + } + star_items = res.body + require 'json' + star_items = JSON.parse(star_items)["items"] + star_project_ids = star_items[0,5] + star_projects = [] + # 查询出id对应的项目 + star_project_ids.each do |id| + id = id.to_i + project = OpenSourceProject.find_by_id(id) + star_projects << project + end + search = OpenSourceProject.search do without(:filtration,0) with(:created_time, Date.new(params[:created_time].to_i, 01, 01)..Date.new(params[:created_time].to_i+1, 01, 01)) if params[:created_time].present? && !('earlier'.eql? params[:created_time]) @@ -240,12 +266,36 @@ class OpenSourceProjectsController < ApplicationController else order_by(:score, :desc) end - paginate :page => params[:page], :per_page => 10 #the default paginator of solr,10 records for each page + paginate :page => params[:page], :per_page => 10 # the default paginator of solr,10 records for each page end per_page_option = 10 @hits = search.hits @open_source_projects = search.results @projects_count = search.total #get total count of search + + open_source_projects_index = 0 + while open_source_projects_index<@open_source_projects.length do + project = @open_source_projects[open_source_projects_index] + id = project.id + star_projects.each do |star_project| + if id.to_i == star_project.id + @open_source_projects.delete(project) + open_source_projects_index -= 1 + break + end + end + open_source_projects_index += 1 + end + + if params[:page]==nil || params[:page]==1 + star_index = star_projects.length-1 + while star_index>=0 do + star_project = star_projects[star_index] + @open_source_projects.unshift(star_project) + star_index -= 1 + end + end + # if @projects_count <= 10 # search_words1 = search_words.unpack("U*").select{ |p| (0x4e00..0x9fa5).member? p }.pack("U*") # if search_words.include?('客户端') @@ -271,6 +321,8 @@ class OpenSourceProjectsController < ApplicationController # @projects_count = search.total + search_1.total # end @projects_pages = Paginator.new @projects_count, per_page_option, params['page'] #custom paginator,to show the 10 records the default paginator of solr gets + + respond_to do |format| format.html { render :layout => "base" } end @@ -408,7 +460,8 @@ class OpenSourceProjectsController < ApplicationController @created_time = params[:created_time] @projects_count=OpenSourceProject.count - @posts_count = RelativeMemo.count + # @posts_count = RelativeMemo.count + @posts_count = 14243800 #@open_source_projects=OpenSourceProject.find_hot_osps(20) @projects_for_cloud = OpenSourceProject.where("filtration != 0").order("composite_score desc").limit(60) diff --git a/trustie2/app/helpers/application_helper.rb b/trustie2/app/helpers/application_helper.rb index a59eca7..d9ca9c5 100644 --- a/trustie2/app/helpers/application_helper.rb +++ b/trustie2/app/helpers/application_helper.rb @@ -1689,11 +1689,11 @@ module ApplicationHelper # nav_list.push(stores_link) if @nav_dispaly_store_all_label #osp_index - nav_list.push(osp_index_link) if @nav_dispaly_osp_index_label + # nav_list.push(osp_index_link) if @nav_dispaly_osp_index_label #highchart_index - nav_list.push(highchart_index_link) if @nav_dispaly_osp_index_label + # nav_list.push(highchart_index_link) if @nav_dispaly_osp_index_label #sta_git_match - nav_list.push(sta_git_match_link) if @nav_dispaly_osp_index_label + # nav_list.push(sta_git_match_link) if @nav_dispaly_osp_index_label content_li = '' nav_list.collect do |nav_item| diff --git a/trustie2/app/models/open_source_project.rb b/trustie2/app/models/open_source_project.rb index 04e73b0..5c12fb4 100644 --- a/trustie2/app/models/open_source_project.rb +++ b/trustie2/app/models/open_source_project.rb @@ -246,12 +246,12 @@ class OpenSourceProject < ActiveRecord::Base #tag_ids = ActsAsTaggableOn::Tagging.select(:tag_id).where("taggable_id = ? AND taggable_type = ? AND disagree_num > ?", obj_id, "OpenSourceProject", 5) # ActsAsTaggableOn::Tag.includes(ActsAsTaggableOn::Tagging).where("taggable_id = ? AND taggable_type = ? AND disagree_num > ?", obj_id, "OpenSourceProject", -1) #tag_list = ActsAsTaggableOn::Tag.select(:name).where(id:tag_ids) - tag_list = ActsAsTaggableOn::Tag.find_by_sql ["SELECT `tags`.* FROM `tags`"+ - " INNER JOIN `taggings` ON `tags`.`id` = `taggings`.`tag_id` WHERE"+ - "`taggings`.`taggable_id` = #{self.id}"+ - " AND `taggings`.`taggable_type` = 'OpenSourceProject' AND `taggings`.`disagree_num` > 5" + - " AND (taggings.context = 'tags' AND taggings.tagger_id IS NULL) LIMIT #{limit}"] - return tag_list + # tag_list = ActsAsTaggableOn::Tag.find_by_sql ["SELECT `tags`.* FROM `tags`"+ + # " INNER JOIN `taggings` ON `tags`.`id` = `taggings`.`tag_id` WHERE"+ + # "`taggings`.`taggable_id` = #{self.id}"+ + # " AND `taggings`.`taggable_type` = 'OpenSourceProject' AND `taggings`.`disagree_num` > 5" + + # " AND (taggings.context = 'tags' AND taggings.tagger_id IS NULL) LIMIT #{limit}"] + # return tag_list end def get_tag_list_userTag(limit = 9) diff --git a/trustie2/app/views/layouts/_base_footer.html.erb b/trustie2/app/views/layouts/_base_footer.html.erb index 83737e6..1ae371d 100644 --- a/trustie2/app/views/layouts/_base_footer.html.erb +++ b/trustie2/app/views/layouts/_base_footer.html.erb @@ -18,7 +18,7 @@ "http://www.miibeian.gov.cn/", :target => "_blank" %>

- diff --git a/trustie2/app/views/layouts/_base_header.html.erb b/trustie2/app/views/layouts/_base_header.html.erb index 1b7cb83..bed0db1 100644 --- a/trustie2/app/views/layouts/_base_header.html.erb +++ b/trustie2/app/views/layouts/_base_header.html.erb @@ -75,7 +75,7 @@ <% end -%> <%#= render_menu :top_menu if User.current.logged? || !Setting.login_required? -%> - <%= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%> + <%#= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%> <%# 自建导航条在base页面中以 (@nav_dispaly......) 开头变量设定, 全局搜索即可发现 %>
diff --git a/trustie2/app/views/layouts/_base_welcome_header.html.erb b/trustie2/app/views/layouts/_base_welcome_header.html.erb index 7720c12..8f6266b 100644 --- a/trustie2/app/views/layouts/_base_welcome_header.html.erb +++ b/trustie2/app/views/layouts/_base_welcome_header.html.erb @@ -52,7 +52,7 @@
<% if User.current.logged? -%>
@@ -75,7 +75,7 @@ <% end -%> <%#= render_menu :top_menu if User.current.logged? || !Setting.login_required? -%> - <%= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%> + <%#= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%> <%# 自建导航条在base页面中以 (@nav_dispaly......) 开头变量设定, 全局搜索即可发现 %>
diff --git a/trustie2/app/views/open_source_projects/_os_project.html.erb b/trustie2/app/views/open_source_projects/_os_project.html.erb index 226d2cd..5fa3f26 100644 --- a/trustie2/app/views/open_source_projects/_os_project.html.erb +++ b/trustie2/app/views/open_source_projects/_os_project.html.erb @@ -24,9 +24,9 @@

- <%= (project.relative_memos_num.nil?)?0:project.relative_memos_num.to_i %> + - <%= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %> +

@@ -35,10 +35,10 @@

- <%= short_num (project.view_num_ossean.nil?)?0:project.view_num_ossean %> + <%#= short_num ((project.view_num.nil?)?0:project.view_num)+project.view_num_ossean+project.relative_topics.sum('view_num') %> - <%= content_tag('span', l(:label_view_num)) %> + <%#= content_tag('span', l(:label_view_num)) %>

diff --git a/trustie2/app/views/open_source_projects/_os_project_search_list.html.erb b/trustie2/app/views/open_source_projects/_os_project_search_list.html.erb index 89b927c..98567a5 100644 --- a/trustie2/app/views/open_source_projects/_os_project_search_list.html.erb +++ b/trustie2/app/views/open_source_projects/_os_project_search_list.html.erb @@ -17,16 +17,16 @@
<%#= textilizable(project.short_description, :project => project) %> - <%= show_project_descriptions project.result.description, 300 %> + <%= show_project_descriptions project.description, 300 %>

- + <%#= link_to project_info.followers_num, open_source_project_path(project) %> - <%= (project.result.relative_memos_num.nil?)?0:project.result.relative_memos_num.to_i %> + <%#= (project.result.relative_memos_num.nil?)?0:project.result.relative_memos_num.to_i %> - <%= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %> + <%#= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %>

- <%= short_num((project.result.view_num_ossean.nil?)?0:project.result.view_num_ossean)%> + <%#= short_num((project.result.view_num_ossean.nil?)?0:project.result.view_num_ossean)%> <%#= short_num ((project.view_num.nil?)?0:project.view_num)+project.view_num_ossean+project.relative_topics.sum('view_num') %> - <%= content_tag('span', l(:label_view_num)) %> + <%#= content_tag('span', l(:label_view_num)) %>

<%= content_tag('span', "#{l(:label_create_time)}: ") %> - <%= content_tag('span', project.result.created_time.to_s[0,10]) %> + <%= content_tag('span', project.created_time.to_s[0,10]) %>
@@ -99,6 +99,6 @@
<%= image_tag("/images/sidebar/tags.png") %> - <%= render :partial => 'tags/tag_name', :locals => {:obj => project.result, :object_flag => "8", :non_list_all => true} %> + <%= render :partial => 'tags/tag_name', :locals => {:obj => project, :object_flag => "8", :non_list_all => true} %>
diff --git a/trustie2/app/views/open_source_projects/_search_bar.html.erb b/trustie2/app/views/open_source_projects/_search_bar.html.erb index 27e87c1..ba14e4a 100644 --- a/trustie2/app/views/open_source_projects/_search_bar.html.erb +++ b/trustie2/app/views/open_source_projects/_search_bar.html.erb @@ -86,7 +86,7 @@ <%= text_field_tag :name, nil, placeholder:'请输入要搜索的关键字', :size => 27, :class => "topics-search-text-field" %>
<%#= hidden_field_tag 'project_type', project_type %> - <%= submit_tag l(:label_search_topics), :class => "enterprise", :name => nil, :onclick => "user_trace_search_knowledge_in_search_bar('#{request.session_options[:id]}');" %> + <%#= submit_tag l(:label_search_topics), :class => "enterprise", :name => nil, :onclick => "user_trace_search_knowledge_in_search_bar('#{request.session_options[:id]}');" %>
<% end %> <%###################search topics###########################%> diff --git a/trustie2/app/views/open_source_projects/search.html.erb b/trustie2/app/views/open_source_projects/search.html.erb index 60d5ba5..521fa66 100644 --- a/trustie2/app/views/open_source_projects/search.html.erb +++ b/trustie2/app/views/open_source_projects/search.html.erb @@ -207,15 +207,33 @@
@@ -237,7 +255,7 @@
--> -