for merge

starlee 2016-11-17 03:49:54 +08:00
parent fbfdcb08c9
commit a0bbd87f99
63 changed files with 1047 additions and 1876 deletions

.gitignore
View File

@ -110,8 +110,10 @@
/gather_program/.settings/
/gather_program/.classpath
#/crawler/dailyScheduledCrawler/fetch_networks/target/
#/crawler/dailyScheduledCrawler/fetch_networks/log/*
#/crawler/daily_scheduler/log/*
#/crawler/moreSmarterCrawler/fetch_networks/target/
#/crawler/moreSmarterCrawler/fetch_networks/log/*
/crawler/dailyScheduledCrawler/fetch_networks/target/
/crawler/dailyScheduledCrawler/fetch_networks/log/*
/crawler/daily_scheduler/log/*
/crawler/daily_scheduler/log_mem/*
/crawler/moreSmarterCrawler/fetch_networks/target/
/crawler/moreSmarterCrawler/fetch_networks/log/*

View File

@ -16,4 +16,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess gather_projects >>log/gather_projects.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess gather_projects > log/gather_projects.log 2>&1 &

View File

@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess job_requirements >>log/job_requirements.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess job_requirements > log/job_requirements.log 2>&1 &

View File

@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx2048m -Xms1024m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:Max
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess relative_memos >>log/relative_memos.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH org.ossean.gather.process.GatherProcess relative_memos > log/relative_memos.log 2>&1 &

View File

@ -39,9 +39,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://172.16.128.30:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://172.16.128.36:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="gather" />
<property name="password" value="influx1234" />
<property name="password" value="Influx@1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>

View File

@ -3,7 +3,7 @@
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="sourceTableName">oschina_question,slashdot,iteye_blog,51cto_blog,csdn_question,cnblog_news,cnblog_question,csdn_bbs,csdn_blogs,dewen_question,stackoverflow</entry>
<entry key="sourceTableName">oschina_question,iteye_blog,51cto_blog,csdn_question,cnblog_news,cnblog_question,csdn_bbs,csdn_blogs,dewen_question,stackoverflow</entry>
<entry key="targetTableName">relative_memos</entry>
<entry key="sourceFields">id,title,content,created_time,now(),type,tags,source,url,url_md5,author,author_url,view_num,review_num,extracted_time</entry>
<entry key="targetFields">id,title,content,created_time,updated_time,memo_type,tags,source,url,url_md5,author,author_url,view_num,review_num,extracted_time</entry>

View File

@ -0,0 +1,29 @@
/*
Navicat MySQL Data Transfer
Source Server : ossean
Source Server Version : 50535
Source Host : 127.0.0.1:3306
Source Database : ossean_new
Target Server Type : MYSQL
Target Server Version : 50535
File Encoding : 65001
Date: 2016-11-15 20:04:15
*/
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for settings
-- ----------------------------
DROP TABLE IF EXISTS `settings`;
CREATE TABLE `settings` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`name` varchar(255) NOT NULL DEFAULT '',
`value` text,
`updated_on` datetime DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `index_settings_on_name` (`name`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

View File

@ -0,0 +1,36 @@
/*
Navicat MySQL Data Transfer
Source Server : ossean
Source Server Version : 50535
Source Host : 127.0.0.1:3306
Source Database : ossean_production
Target Server Type : MYSQL
Target Server Version : 50535
File Encoding : 65001
Date: 2016-11-13 22:21:05
*/
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for taggings
-- ----------------------------
DROP TABLE IF EXISTS `taggings`;
CREATE TABLE `taggings` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`tag_id` int(11) NOT NULL,
`taggable_id` int(11) NOT NULL,
`taggable_type` varchar(255) NOT NULL,
`tagger_id` int(11) DEFAULT NULL,
`tagger_type` varchar(255) DEFAULT NULL,
`context` varchar(128) DEFAULT NULL,
`created_at` datetime DEFAULT NULL,
`created_time` datetime DEFAULT NULL,
`disagree_num` int(11) DEFAULT '0',
`tag_source` varchar(255) DEFAULT NULL,
PRIMARY KEY (`id`),
UNIQUE KEY `index_taggings_on_tag_id_and_taggable_id_and_taggable_type` (`tag_id`,`taggable_id`,`taggable_type`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=19315557 DEFAULT CHARSET=utf8;

View File

@ -1,363 +1,363 @@
package org.ossean.gather.process;
import java.util.List;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.ossean.gather.model.Configure;
import org.ossean.gather.model.GatherProject;
import org.ossean.gather.model.JobRequirement;
import org.ossean.gather.model.PKControlPosts;
import org.ossean.gather.model.PKControlProjects;
import org.ossean.gather.model.RelativeMemo;
import org.ossean.gather.model.Taggings;
import org.ossean.gather.sourceDao.GatherDao;
import org.ossean.gather.sourceDao.PKControlPostsDao;
import org.ossean.gather.sourceDao.PKControlProjectsDao;
import org.ossean.gather.targetDao.PointerDao;
import org.ossean.gather.targetDao.TargetDao;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
@Component("gatherThread")
@Scope("prototype")
public class GatherThread implements Runnable {
private static Logger logger = Logger.getLogger(GatherThread.class);
private Configure conf;
@Resource
private GatherDao gatherDao;
@Resource
private PointerDao pointerDao;
@Resource
private PKControlPostsDao pkControlPostsDao;
@Resource
private TargetDao targetDao;
@Resource
private PKControlProjectsDao pkControlProjectsDao;
private int idsBegin; // id where the transfer starts
private int idsEnd; // id where the transfer ends
private int idsIncrement;// number of ids transferred per batch
private int beginId;
private int endId;
private String sourceTableName;
private String pkControlPostsTableName = "pk_control_posts";
private String pkControlProjectsTableName = "pk_control_projects";
private String taggingsTableName = "taggings";
private String tagsTableName = "tags";
private String gatherPostsTableName = "relative_memos";
private int maxId;
public void setParameters(Configure conf, String sourceTableName) {
this.conf = conf;
this.sourceTableName = sourceTableName;
}
// read the pointer
public int readPointer(String table, String source, String target) {
int pointer = 1;
try {
pointer = pointerDao.getPointer(table, source, target);
} catch (Exception e) {
// means the table holds no pointer yet
logger.info("No such pointer! Create one");
pointerDao.insertPointer(table, source, target, 1);
}
return pointer;
}
@Override
public void run() {
// long start = System.currentTimeMillis();
Thread.currentThread().setName(sourceTableName);
idsIncrement = conf.getIdsIncrement();
idsBegin = readPointer(conf.getPointerTableName(), sourceTableName,
conf.getTargetTableName());
idsEnd = maxId = gatherDao.getMaxId(sourceTableName);
while (idsBegin < idsEnd) {
beginId = idsBegin;
endId = beginId + idsIncrement - 1; // both endpoints are inclusive when fetching data
if (endId <= maxId) {
handleBatchData(beginId, endId, conf);
idsBegin = idsBegin + idsIncrement;
} else {
endId = maxId; // endId should not exceed maxId
handleBatchData(beginId, endId, conf);
break;
}
}
GatherProcess.gatherState.put(sourceTableName, false);
// long end = System.currentTimeMillis();
// logger.info((end - start) / 6000);
}
public void handleBatchData(int beginId, int endId, Configure conf) {
logger.info("BeginId#" + sourceTableName + ":" + beginId);
// the task has not finished yet
int maxId = gatherDao.getMaxId(sourceTableName);
// avoid transferring ids beyond the current maximum
if (beginId >= 0 && endId > 0 && maxId >= endId) {
// update the execution start time
logger.info("begin gathering...");
// insert this id range, ignoring duplicates
try {
String[] sourceFields = conf.getSourceFields().split(",");
String[] targetFields = conf.getTargetFields().split(",");
String selectItems = "";
for (int i = 0; i < sourceFields.length; i++) {
String str_source = sourceFields[i];
String str_target = targetFields[i];
selectItems += str_source + " as " + str_target + ",";
}
selectItems = selectItems
.substring(0, selectItems.length() - 1) + " ";
if (conf.getTargetTableName().equals("relative_memos")) {
List<RelativeMemo> dataGet = gatherDao.getPostGatherData(
sourceTableName, selectItems, beginId, endId,
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
RelativeMemo model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// use urlMD5 to check whether this post already exists
// whether to update
int postId = 0;
if(GatherProcess.urlMd5Set.contains(urlMD5)){
//urlMD5 already exists, so update
RelativeMemo samePost = targetDao.findPostByUrlMD5(
conf.getTargetTableName(), urlMD5);
// update the corresponding record in gather_projects and maintain the pending-update table
postId = samePost.getId();
model.setId(postId);
handleUpdateGatherPosts(samePost.getId(), model);
}else{
// not present, so insert
PKControlPosts pkControlModel = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);// check whether a fixed id already exists
if (pkControlModel != null)
model.setId(pkControlModel.getId());
else {
// generate the id for the current item in the pk_control_posts table
pkControlPostsDao.insertOneItem(
pkControlPostsTableName, urlMD5);
// look up the id of the row just inserted
PKControlPosts controlItem = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);
// use this id as the model's fixed, immutable id
model.setId(controlItem.getId());
postId = model.getId();
}
handleInsertGatherPosts(model, conf);
GatherProcess.urlMd5Set.add(urlMD5);
}
// store the tag-item relation in the item_tag_relation table and split the tags
String tags = model.getTags();
if (tags == null) {
// the item has no tags
continue;
}
List<String> tagList = DataHandler
.tagsSegmentation(tags);
for (String tag : tagList) {
targetDao.insertTag(tagsTableName, tag);// insert the item's tag with INSERT IGNORE
int tag_id = targetDao.selectTagIdByName(
tagsTableName, tag);
Taggings taggings = new Taggings();
taggings.setTag_id(tag_id);
taggings.setTaggable_id(postId);
taggings.setTaggable_type("RelativeMemo");
taggings.setContext("tags");
taggings.setCreated_at(DataHandler.getNow());
// persist the Taggings object
try {
targetDao.insertTaggings(taggingsTableName,
taggings);
} catch (Exception e) {
// the corresponding row in relative_memos was already deleted before this insert
logger.error(e);
System.exit(0);
}
}
}
} else if (conf.getTargetTableName().equals("gather_projects")) {
List<GatherProject> dataGet = gatherDao.getPrjGatherData(
sourceTableName, selectItems, beginId, endId,
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
GatherProject model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// use urlMD5 to check whether this project already exists
// whether to update
int prjId = 0;
if(GatherProcess.urlMd5Set.contains(urlMD5)){
GatherProject samePrj = targetDao.findPrjByUrlMD5(
conf.getTargetTableName(), urlMD5);
// update the corresponding record in gather_projects and maintain the pending-update table
prjId = samePrj.getId();
model.setId(prjId);
model.setUpdate_mark(1);
handleUpdateGatherProjects(samePrj.getId(), model);
}else{
// not present, so insert
PKControlProjects pkControlProjects = pkControlProjectsDao
.selectItemByUrlMD5(
pkControlProjectsTableName, urlMD5);// check whether a fixed id already exists
if (pkControlProjects != null)
model.setId(pkControlProjects.getId());
else {
// generate the id for the current project in the pk_control_projects table
pkControlProjectsDao.insertOneItem(
pkControlProjectsTableName, urlMD5);
// look up the id of the row just inserted
PKControlProjects controlItem = pkControlProjectsDao
.selectItemByUrlMD5(
pkControlProjectsTableName,
urlMD5);
// use this id as the model's fixed, immutable id
model.setId(controlItem.getId());
prjId = model.getId();
}
model.setUpdate_mark(0);
handleInsertGatherProjects(model, conf);
GatherProcess.urlMd5Set.add(urlMD5);
}
// // store the tag-item relation in the item_tag_relation table and split the tags
// String tags = model.getTags();
// if (tags == null) {
// // the item has no tags
// continue;
// }
// List<String> tagList = DataHandler
// .tagsSegmentation(tags);
// for (String tag : tagList) {
// targetDao.insertTag(tagsTableName, tag);// insert the item's tag with INSERT IGNORE
// int tag_id = targetDao.selectTagIdByName(
// tagsTableName, tag);
// Taggings taggings = new Taggings();
// taggings.setTag_id(tag_id);
// taggings.setTaggable_id(prjId);
// taggings.setTaggable_type("OpenSourceProject");
// taggings.setContext("tags");
// taggings.setCreated_at(DataHandler.getNow());
// // persist the Taggings object
// try {
// targetDao.insertTaggings(taggingsTableName,
// taggings);
// } catch (Exception e) {
// // the corresponding row in relative_memos was already deleted before this insert
// logger.error(e);
// System.exit(0);
// }
// }
}
} else {
List<JobRequirement> dataGet = gatherDao.getJobGatherData(
sourceTableName, selectItems, beginId, endId,
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
JobRequirement model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// use urlMD5 to check whether this post already exists
// whether to update
int postId = 0;
if(GatherProcess.urlMd5Set.contains(urlMD5)){
JobRequirement sameJob = targetDao.findJobByUrlMD5(
conf.getTargetTableName(), urlMD5);
// update the corresponding record in gather_projects and maintain the pending-update table
postId = sameJob.getId();
model.setId(postId);
handleUpdateGatherJobs(sameJob.getId(), model);
}else{
// not present, so insert
PKControlPosts pkControlPosts = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);// check whether a fixed id already exists
if (pkControlPosts != null)
model.setId(pkControlPosts.getId());
else {
// generate the id for the current item in the pk_control_posts table
pkControlPostsDao.insertOneItem(
pkControlPostsTableName, urlMD5);
// look up the id of the row just inserted
PKControlPosts controlItem = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);
// use this id as the model's fixed, immutable id
model.setId(controlItem.getId());
postId = model.getId();
}
handleInsertGatherJobs(model, conf);
GatherProcess.urlMd5Set.add(urlMD5);
}
}
}
} catch (Exception ex) {
// exceptions may occur during the data transfer
logger.error(ex);
System.exit(0);
}
// advance the pointer to this batch's endId + 1
pointerDao.updatePointer(conf.getPointerTableName(),
sourceTableName, conf.getTargetTableName(), endId + 1);// i.e. sourceIdBegin + idsIncrement
logger.info("current--" + sourceTableName + ": " + endId);
}
}
// handle posts whose URL is new: insert into relative_memos
public void handleInsertGatherPosts(RelativeMemo model, Configure conf) {
try {
targetDao.insertRelativeMemo(conf.getTargetTableName(),
conf.getTargetFields(), model);
} catch (Exception e) {
logger.error(e);
}
}
// handle posts with an existing URL: update; id is the post's fixed id
public void handleUpdateGatherPosts(int id, RelativeMemo model_new) {
targetDao.updateRelativeMemo(gatherPostsTableName, model_new, id);// update the row in relative_memos
}
// handle projects whose URL is new: insert into gather_projects
public void handleInsertGatherProjects(GatherProject model, Configure conf) {
try {
targetDao.insertOpenSourceProject(conf.getTargetTableName(),
conf.getTargetFields(), model);
} catch (Exception e) {
logger.error(e);
}
}
// handle projects with an existing URL: update; id is the project's fixed id
public void handleUpdateGatherProjects(int id, GatherProject model_new) {
targetDao.updateOpenSourceProject(conf.getTargetTableName(), model_new,
id);// update the row in gather_projects
}
// handle job requirements whose URL is new: insert into job_requirements
public void handleInsertGatherJobs(JobRequirement model, Configure conf) {
try {
targetDao.insertJobRequirement(conf.getTargetTableName(),
conf.getTargetFields(), model);
} catch (Exception e) {
logger.error(e);
}
}
// handle job requirements with an existing URL: update; id is the job's fixed id
public void handleUpdateGatherJobs(int id, JobRequirement model_new) {
targetDao
.updateJobRequirement(conf.getTargetTableName(), model_new, id);// update the row in job_requirements
}
}
package org.ossean.gather.process;
import java.util.List;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.ossean.gather.model.Configure;
import org.ossean.gather.model.GatherProject;
import org.ossean.gather.model.JobRequirement;
import org.ossean.gather.model.PKControlPosts;
import org.ossean.gather.model.PKControlProjects;
import org.ossean.gather.model.RelativeMemo;
import org.ossean.gather.model.Taggings;
import org.ossean.gather.sourceDao.GatherDao;
import org.ossean.gather.sourceDao.PKControlPostsDao;
import org.ossean.gather.sourceDao.PKControlProjectsDao;
import org.ossean.gather.targetDao.PointerDao;
import org.ossean.gather.targetDao.TargetDao;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;
@Component("gatherThread")
@Scope("prototype")
public class GatherThread implements Runnable {
private static Logger logger = Logger.getLogger(GatherThread.class);
private Configure conf;
@Resource
private GatherDao gatherDao;
@Resource
private PointerDao pointerDao;
@Resource
private PKControlPostsDao pkControlPostsDao;
@Resource
private TargetDao targetDao;
@Resource
private PKControlProjectsDao pkControlProjectsDao;
private int idsBegin; // id where the transfer starts
private int idsEnd; // id where the transfer ends
private int idsIncrement;// number of ids transferred per batch
private int beginId;
private int endId;
private String sourceTableName;
private String pkControlPostsTableName = "pk_control_posts";
private String pkControlProjectsTableName = "pk_control_projects";
private String memoTaggingsTableName = "memo_taggings";
private String tagsTableName = "tags";
private String gatherPostsTableName = "relative_memos";
private int maxId;
public void setParameters(Configure conf, String sourceTableName) {
this.conf = conf;
this.sourceTableName = sourceTableName;
}
// read the pointer
public int readPointer(String table, String source, String target) {
int pointer = 1;
try {
pointer = pointerDao.getPointer(table, source, target);
} catch (Exception e) {
// means the table holds no pointer yet
logger.info("No such pointer! Create one");
pointerDao.insertPointer(table, source, target, 1);
}
return pointer;
}
@Override
public void run() {
// long start = System.currentTimeMillis();
Thread.currentThread().setName(sourceTableName);
idsIncrement = conf.getIdsIncrement();
idsBegin = readPointer(conf.getPointerTableName(), sourceTableName,
conf.getTargetTableName());
idsEnd = maxId = gatherDao.getMaxId(sourceTableName);
while (idsBegin < idsEnd) {
beginId = idsBegin;
endId = beginId + idsIncrement - 1; // both endpoints are inclusive when fetching data
if (endId <= maxId) {
handleBatchData(beginId, endId, conf);
idsBegin = idsBegin + idsIncrement;
} else {
endId = maxId; // endId should not exceed maxId
handleBatchData(beginId, endId, conf);
break;
}
}
GatherProcess.gatherState.put(sourceTableName, false);
// long end = System.currentTimeMillis();
// logger.info((end - start) / 6000);
}
public void handleBatchData(int beginId, int endId, Configure conf) {
logger.info("BeginId#" + sourceTableName + ":" + beginId);
// the task has not finished yet
int maxId = gatherDao.getMaxId(sourceTableName);
// avoid transferring ids beyond the current maximum
if (beginId >= 0 && endId > 0 && maxId >= endId) {
// update the execution start time
logger.info("begin gathering...");
// insert this id range, ignoring duplicates
try {
String[] sourceFields = conf.getSourceFields().split(",");
String[] targetFields = conf.getTargetFields().split(",");
String selectItems = "";
for (int i = 0; i < sourceFields.length; i++) {
String str_source = sourceFields[i];
String str_target = targetFields[i];
selectItems += str_source + " as " + str_target + ",";
}
selectItems = selectItems
.substring(0, selectItems.length() - 1) + " ";
if (conf.getTargetTableName().equals("relative_memos")) {
List<RelativeMemo> dataGet = gatherDao.getPostGatherData(
sourceTableName, selectItems, beginId, endId,
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
RelativeMemo model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// use urlMD5 to check whether this post already exists
// whether to update
int postId = 0;
if(GatherProcess.urlMd5Set.contains(urlMD5)){
//urlMD5 already exists, so update
RelativeMemo samePost = targetDao.findPostByUrlMD5(
conf.getTargetTableName(), urlMD5);
// update the corresponding record in gather_projects and maintain the pending-update table
postId = samePost.getId();
model.setId(postId);
handleUpdateGatherPosts(samePost.getId(), model);
}else{
// not present, so insert
PKControlPosts pkControlModel = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);// check whether a fixed id already exists
if (pkControlModel != null)
model.setId(pkControlModel.getId());
else {
// generate the id for the current item in the pk_control_posts table
pkControlPostsDao.insertOneItem(
pkControlPostsTableName, urlMD5);
// look up the id of the row just inserted
PKControlPosts controlItem = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);
// use this id as the model's fixed, immutable id
model.setId(controlItem.getId());
postId = model.getId();
}
handleInsertGatherPosts(model, conf);
GatherProcess.urlMd5Set.add(urlMD5);
}
// store the tag-item relation in the item_tag_relation table and split the tags
String tags = model.getTags();
if (tags == null) {
// the item has no tags
continue;
}
List<String> tagList = DataHandler
.tagsSegmentation(tags);
for (String tag : tagList) {
targetDao.insertTag(tagsTableName, tag);// insert the item's tag with INSERT IGNORE
int tag_id = targetDao.selectTagIdByName(
tagsTableName, tag);
Taggings taggings = new Taggings();
taggings.setTag_id(tag_id);
taggings.setTaggable_id(postId);
taggings.setTaggable_type("RelativeMemo");
taggings.setContext("tags");
taggings.setCreated_at(DataHandler.getNow());
// persist the Taggings object
try {
targetDao.insertTaggings(memoTaggingsTableName,
taggings);
} catch (Exception e) {
// the corresponding row in relative_memos was already deleted before this insert
logger.error(e);
System.exit(0);
}
}
}
} else if (conf.getTargetTableName().equals("gather_projects")) {
List<GatherProject> dataGet = gatherDao.getPrjGatherData(
sourceTableName, selectItems, beginId, endId,
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
GatherProject model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// use urlMD5 to check whether this project already exists
// whether to update
int prjId = 0;
if(GatherProcess.urlMd5Set.contains(urlMD5)){
GatherProject samePrj = targetDao.findPrjByUrlMD5(
conf.getTargetTableName(), urlMD5);
// update the corresponding record in gather_projects and maintain the pending-update table
prjId = samePrj.getId();
model.setId(prjId);
model.setUpdate_mark(2);
handleUpdateGatherProjects(samePrj.getId(), model);
}else{
// not present, so insert
PKControlProjects pkControlProjects = pkControlProjectsDao
.selectItemByUrlMD5(
pkControlProjectsTableName, urlMD5);// check whether a fixed id already exists
if (pkControlProjects != null)
model.setId(pkControlProjects.getId());
else {
// generate the id for the current project in the pk_control_projects table
pkControlProjectsDao.insertOneItem(
pkControlProjectsTableName, urlMD5);
// look up the id of the row just inserted
PKControlProjects controlItem = pkControlProjectsDao
.selectItemByUrlMD5(
pkControlProjectsTableName,
urlMD5);
// use this id as the model's fixed, immutable id
model.setId(controlItem.getId());
prjId = model.getId();
}
model.setUpdate_mark(0);
handleInsertGatherProjects(model, conf);
GatherProcess.urlMd5Set.add(urlMD5);
}
// // store the tag-item relation in the item_tag_relation table and split the tags
// String tags = model.getTags();
// if (tags == null) {
// // the item has no tags
// continue;
// }
// List<String> tagList = DataHandler
// .tagsSegmentation(tags);
// for (String tag : tagList) {
// targetDao.insertTag(tagsTableName, tag);// insert the item's tag with INSERT IGNORE
// int tag_id = targetDao.selectTagIdByName(
// tagsTableName, tag);
// Taggings taggings = new Taggings();
// taggings.setTag_id(tag_id);
// taggings.setTaggable_id(prjId);
// taggings.setTaggable_type("OpenSourceProject");
// taggings.setContext("tags");
// taggings.setCreated_at(DataHandler.getNow());
// // persist the Taggings object
// try {
// targetDao.insertTaggings(taggingsTableName,
// taggings);
// } catch (Exception e) {
// // the corresponding row in relative_memos was already deleted before this insert
// logger.error(e);
// System.exit(0);
// }
// }
}
} else {
List<JobRequirement> dataGet = gatherDao.getJobGatherData(
sourceTableName, selectItems, beginId, endId,
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
JobRequirement model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// use urlMD5 to check whether this post already exists
// whether to update
int postId = 0;
if(GatherProcess.urlMd5Set.contains(urlMD5)){
JobRequirement sameJob = targetDao.findJobByUrlMD5(
conf.getTargetTableName(), urlMD5);
// update the corresponding record in gather_projects and maintain the pending-update table
postId = sameJob.getId();
model.setId(postId);
handleUpdateGatherJobs(sameJob.getId(), model);
}else{
// not present, so insert
PKControlPosts pkControlPosts = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);// check whether a fixed id already exists
if (pkControlPosts != null)
model.setId(pkControlPosts.getId());
else {
// generate the id for the current item in the pk_control_posts table
pkControlPostsDao.insertOneItem(
pkControlPostsTableName, urlMD5);
// look up the id of the row just inserted
PKControlPosts controlItem = pkControlPostsDao
.selectItemByUrlMD5(
pkControlPostsTableName, urlMD5);
// use this id as the model's fixed, immutable id
model.setId(controlItem.getId());
postId = model.getId();
}
handleInsertGatherJobs(model, conf);
GatherProcess.urlMd5Set.add(urlMD5);
}
}
}
} catch (Exception ex) {
// exceptions may occur during the data transfer
logger.error(ex);
System.exit(0);
}
// advance the pointer to this batch's endId + 1
pointerDao.updatePointer(conf.getPointerTableName(),
sourceTableName, conf.getTargetTableName(), endId + 1);// i.e. sourceIdBegin + idsIncrement
logger.info("current--" + sourceTableName + ": " + endId);
}
}
// handle posts whose URL is new: insert into relative_memos
public void handleInsertGatherPosts(RelativeMemo model, Configure conf) {
try {
targetDao.insertRelativeMemo(conf.getTargetTableName(),
conf.getTargetFields(), model);
} catch (Exception e) {
logger.error(e);
}
}
// handle posts with an existing URL: update; id is the post's fixed id
public void handleUpdateGatherPosts(int id, RelativeMemo model_new) {
targetDao.updateRelativeMemo(gatherPostsTableName, model_new, id);// update the row in relative_memos
}
// handle projects whose URL is new: insert into gather_projects
public void handleInsertGatherProjects(GatherProject model, Configure conf) {
try {
targetDao.insertOpenSourceProject(conf.getTargetTableName(),
conf.getTargetFields(), model);
} catch (Exception e) {
logger.error(e);
}
}
// handle projects with an existing URL: update; id is the project's fixed id
public void handleUpdateGatherProjects(int id, GatherProject model_new) {
targetDao.updateOpenSourceProject(conf.getTargetTableName(), model_new,
id);// update the row in gather_projects
}
// handle job requirements whose URL is new: insert into job_requirements
public void handleInsertGatherJobs(JobRequirement model, Configure conf) {
try {
targetDao.insertJobRequirement(conf.getTargetTableName(),
conf.getTargetFields(), model);
} catch (Exception e) {
logger.error(e);
}
}
// handle job requirements with an existing URL: update; id is the job's fixed id
public void handleUpdateGatherJobs(int id, JobRequirement model_new) {
targetDao
.updateJobRequirement(conf.getTargetTableName(), model_new, id);// update the row in job_requirements
}
}
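Taken together, readPointer(), run() and handleBatchData() above implement a pointer-driven, id-windowed copy from a source table into a target table. A minimal standalone sketch of that control flow, with hypothetical SourceDao/PointerStore interfaces standing in for the project's GatherDao and PointerDao (a sketch only, not the project's actual classes):
// Sketch only: hypothetical interfaces, not the project's real DAOs.
interface SourceDao {
    int getMaxId(String table);
}
interface PointerStore {
    int read(String sourceTable);           // last transferred id + 1, or 1 on the first run
    void write(String sourceTable, int next);
}
class WindowedTransfer {
    private final SourceDao source;
    private final PointerStore pointers;
    private final int window; // ids per batch, like idsIncrement above
    WindowedTransfer(SourceDao source, PointerStore pointers, int window) {
        this.source = source;
        this.pointers = pointers;
        this.window = window;
    }
    void run(String table) {
        int begin = pointers.read(table);               // resume where the previous run stopped
        int maxId = source.getMaxId(table);
        while (begin <= maxId) {
            int end = Math.min(begin + window - 1, maxId); // inclusive id range [begin, end]
            copyBatch(table, begin, end);
            pointers.write(table, end + 1);             // advance the pointer past this batch
            begin = end + 1;
        }
    }
    void copyBatch(String table, int beginId, int endId) {
        // placeholder for the per-row url_md5 de-duplication and insert/update logic above
    }
}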

View File

@ -36,9 +36,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://172.16.128.30:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://172.16.128.36:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="gather" />
<property name="password" value="influx1234" />
<property name="password" value="Influx@1234" />
<property name="initialSize" value="5" />
<property name="maxActive" value="100" />
</bean>

View File

@ -17,7 +17,6 @@ import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
@ -36,11 +35,9 @@ import org.wltea.analyzer.lucene.IKAnalyzer;
import com.ossean.match.dao.ProjectDao;
import com.ossean.match.dao.RelativeMemoDao;
import com.ossean.match.matchprocess.CountFrequency;
import com.ossean.match.matchprocess.MatchIncrement;
import com.ossean.match.model.Project;
import com.ossean.match.model.RelativeMemo;
import com.ossean.match.model.Tag;
import com.ossean.match.utils.Normalizer;
@Component("luceneindex")
@ -84,36 +81,6 @@ public class LuceneIndex {
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
IndexWriter iw = new IndexWriter(dire, iwc);
return iw;
}
public static void buildTagIndex(List<Tag> tags) {
Logger logger1 = LoggerFactory.getLogger(LuceneIndex.class);
Directory dir;
Analyzer analyzer = new IKAnalyzer(true);
try {
dir = FSDirectory.open(Paths.get(CountFrequency.tagIndexDir));
IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(dir, iwc);
for (Tag tag : tags) {
// build an index entry for the tag
Document doc = new Document();
String name = tag.getName().toLowerCase();
// store the tag's original name
doc.add(new StringField("name", "" + name, Field.Store.YES));
doc.add(new StringField("id", "" + tag.getId(), Field.Store.YES));
// tokenize the tag
doc.add(new TextField("items", Normalizer.normalize(name),
Field.Store.NO));
writer.addDocument(doc);
}
writer.close();
} catch (IOException e) {
logger1.error("buildTagIndex IOException: " + e);
}
// Analyzer analyzer = new SimpleAnalyzer();
}
/**
@ -172,9 +139,6 @@ public class LuceneIndex {
endId += step;
}
}
matchIncrement.setPrjHistory(0);
matchIncrement.setPrjIndexed(maxPrjId);
matchIncrement.writeIntoRecord();
writer.commit();
writer.close();
}
@ -225,9 +189,7 @@ public class LuceneIndex {
endId += step;
}
}
matchIncrement.setPrjHistory(0);
matchIncrement.setMemoHistory(lastMemoId);
matchIncrement.setMemoIndexed(lastMemoId);
matchIncrement.writeIntoRecord();
/*FileOutputStream fout = new FileOutputStream(new File("record.txt"));
fout.write((0 + "\t" + lastMemoId + "\t" + 0 + "\t" + 0).getBytes()); //initialize record.txt: the project start id is 0 and the memo start id is the last memo indexed
@ -256,7 +218,7 @@ public class LuceneIndex {
continue;
}
Document doc = new Document();
doc.add(new StringField(memoIdFieldName, String.valueOf(rsId), Store.YES));
doc.add(new StringField(memoIdFieldName, String.valueOf(rsId) + "", Store.YES));
doc.add(new TextField(titleFieldName, memo.getTitle(), Store.NO));
String memoTagsString = memo.getTags();
if (memoTagsString == null) {
@ -266,7 +228,7 @@ public class LuceneIndex {
for(String memoTag : memoTagsList){
doc.add(new StringField(memoTagsFieldName, memoTag, Store.NO));
}
indexWriter.addDocument(doc);
indexWriter.updateDocument(new Term(memoIdFieldName, String.valueOf(rsId) + ""), doc);
}
indexWriter.commit();
indexWriter.close();
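The change above from addDocument to updateDocument keyed on the memo id makes re-indexing idempotent: a document with the same id is replaced instead of duplicated. A minimal sketch of that upsert pattern, assuming Lucene 5.x-style APIs like those used in this file (the field names here are illustrative, not the project's):
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class MemoUpsertExample {
    public static void upsert(IndexWriter writer, String memoId, String title) throws Exception {
        Document doc = new Document();
        doc.add(new StringField("memo_id", memoId, Store.YES));
        doc.add(new TextField("title", title, Store.NO));
        // replaces any previously indexed document carrying the same memo_id term
        writer.updateDocument(new Term("memo_id", memoId), doc);
    }

    public static void main(String[] args) throws Exception {
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        try (IndexWriter writer = new IndexWriter(FSDirectory.open(Paths.get("memoIndex")), iwc)) {
            upsert(writer, "42", "spring boot question");
            upsert(writer, "42", "spring boot question (edited)"); // second call overwrites, not duplicates
            writer.commit();
        }
    }
}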

View File

@ -7,11 +7,8 @@ import java.util.List;
import javax.annotation.Resource;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Explanation;
@ -24,10 +21,8 @@ import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.ossean.match.dao.ProjectDao;
import com.ossean.match.model.Project;
import com.ossean.match.utils.Normalizer;
public class LuceneSearch {
@ -89,7 +84,7 @@ public class LuceneSearch {
}
query.add(tq, BooleanClause.Occur.SHOULD);
}
TopDocs td = is.search(query, 3);
TopDocs td = is.search(query, 10000);
ScoreDoc[] sds = td.scoreDocs;
for (ScoreDoc sd : sds) {
Document d = is.doc(sd.doc);
@ -98,10 +93,6 @@ public class LuceneSearch {
for(String prjName : prjNames){
if (keyWords.contains(prjName)) {
int pId = Integer.parseInt(prjId);
Project currentPrj = projectDao.getPrjById(pId);
if(currentPrj.getFiltration()==0){
}
if (matchMap.containsKey(pId)) {
matchMap.put(pId, matchMap.get(pId) + weight + sd.score/1000);
} else
@ -130,59 +121,45 @@ public class LuceneSearch {
*/
public static HashMap<Integer, Double> prjToMemoMatchByLucene(
String prjName, String searchField, double weight,
HashMap<Integer, Double> map, IndexReader memoIndexReader, IndexReader prjIndexReader) {
HashMap<Integer, Double> map, IndexReader memoIndexReader) {
try {
IndexSearcher is = new IndexSearcher(memoIndexReader);
List<String> prjNameList = Normalizer.getList(prjName);
BooleanQuery query = new BooleanQuery();
// Similarity similarity = new DefaultSimilarity(){
// @Override
// public float queryNorm(float sumOfSquaredWeights) {
// return 1.0f;
// }
// @Override
// public float lengthNorm(FieldInvertState state) {
// return 1.0f;
// }
// };
// is.setSimilarity(similarity);
for(String prjNameTerm : prjNameList){
//Term termForFreq = new Term(LuceneIndex.prjNameFieldName, prjNameTerm);
Term term = new Term(searchField, prjNameTerm);
TermQuery tq = new TermQuery(term);
// double curTermDocFreq = prjIndexReader.docFreq(termForFreq);
// if (prjNameList.size() > 1 && curTermDocFreq > 10) {
// tq.setBoost((float) (1/(curTermDocFreq/10)));
// }
// else {
// tq.setBoost(1.1f);
// }
query.add(tq, BooleanClause.Occur.MUST);
query.add(tq, BooleanClause.Occur.MUST); //every term of the tokenized project name must appear in the memo
}
TopDocs td = is.search(query, 1000000);
ScoreDoc[] sds = td.scoreDocs;
for (ScoreDoc sd : sds) {
Document d = is.doc(sd.doc);
// if (sd.score >= 0.7) {
String postId = d.get(LuceneIndex.memoIdFieldName);
int pId = Integer.parseInt(postId);
if (map.containsKey(pId)) {
map.put(pId, map.get(pId) + weight + sd.score/1000);
} else
map.put(pId, weight + sd.score/1000);
// }
}
} catch (IOException e) {
logger.error("prjToMemoMatchByLucene IOException: " + e);
} /*catch (ParseException e) {
logger.error("prjToMemoMatchByLucene ParseException: " + e);
}*/
}
return map;
}
public static HashMap<Integer, Double> searchMemoTags(String tagStr, String searchField, double weight,
/**
* Match project names and synonyms against memo tags
* @param tagStr
* @param searchField
* @param weight
* @param map
* @param indexReader
* @return
*/
public static HashMap<Integer, Double> searchMemoTags(String tagStr, String searchField, String idField, double weight,
HashMap<Integer, Double> map, IndexReader indexReader) {
try {
@ -193,11 +170,11 @@ public class LuceneSearch {
ScoreDoc[] sds = td.scoreDocs;
for (ScoreDoc sd : sds) {
Document d = is.doc(sd.doc);
String postId = d.get(LuceneIndex.memoIdFieldName);
String postId = d.get(idField);
int pId = Integer.parseInt(postId);
if (map.containsKey(pId)) {
if (map.containsKey(pId)) {
map.put(pId, map.get(pId) + weight);
} else {
} else {
map.put(pId, weight);
}
}
@ -207,6 +184,16 @@ public class LuceneSearch {
return map;
}
/**
* Match project tags against memo tags
* @param tagStr
* @param searchField
* @param idField
* @param weight
* @param map
* @param indexReader
* @return
*/
public static HashMap<Integer, Double> searchByPrjTag(String tagStr, String searchField, String idField, double weight,
HashMap<Integer, Double> map, IndexReader indexReader) {
@ -220,7 +207,7 @@ public class LuceneSearch {
Document d = is.doc(sd.doc);
String postId = d.get(idField);
int pId = Integer.parseInt(postId);
if (map.containsKey(pId)) {
if (map.containsKey(pId)) { //only add tag-match scores when the project name or a synonym already matched
map.put(pId, map.get(pId) + weight);
}
}
@ -230,6 +217,16 @@ public class LuceneSearch {
return map;
}
/**
* Search memo titles with project tags
* @param tagStr
* @param searchField
* @param idField
* @param weight
* @param map
* @param indexReader
* @return
*/
public static HashMap<Integer, Double> searchByPrjTagInMemoTitle(String tagStr, String searchField, String idField, double weight,
HashMap<Integer, Double> map, IndexReader indexReader) {
@ -260,34 +257,34 @@ public class LuceneSearch {
// get the number of tags matched for each memo
public static HashMap<Integer, Integer> tagsMatch(String idField,
String tags, String searchField, IndexReader indexReader) {
HashMap<Integer, Integer> tagsMatchNum = new HashMap<Integer, Integer>();
try {
IndexSearcher is = new IndexSearcher(indexReader);
QueryParser parser = new QueryParser(searchField, new IKAnalyzer(true));
Query query = parser.parse(tags);
TopDocs td = is.search(query, 100000);
ScoreDoc[] sds = td.scoreDocs;
for (ScoreDoc sd : sds) {
Document d = is.doc(sd.doc);
String postId = d.get(idField);
int pId = Integer.parseInt(postId);
Explanation explanation = is.explain(query, sd.doc);
int hitNum = getHitTermsNum(explanation);
if (tagsMatchNum.containsKey(pId)) {
tagsMatchNum.put(pId, tagsMatchNum.get(pId) + hitNum);
} else
tagsMatchNum.put(pId, hitNum);
}
} catch (IOException e) {
logger.error("tagsMatch IOException: " + e);
} catch (ParseException e) {
logger.error("tagsMatch ParseException: " + e);
}
return tagsMatchNum;
}
// public static HashMap<Integer, Integer> tagsMatch(String idField,
// String tags, String searchField, IndexReader indexReader) {
// HashMap<Integer, Integer> tagsMatchNum = new HashMap<Integer, Integer>();
// try {
// IndexSearcher is = new IndexSearcher(indexReader);
// QueryParser parser = new QueryParser(searchField, new IKAnalyzer(true));
// Query query = parser.parse(tags);
// TopDocs td = is.search(query, 100000);
// ScoreDoc[] sds = td.scoreDocs;
// for (ScoreDoc sd : sds) {
// Document d = is.doc(sd.doc);
// String postId = d.get(idField);
// int pId = Integer.parseInt(postId);
// Explanation explanation = is.explain(query, sd.doc);
// int hitNum = getHitTermsNum(explanation);
// if (tagsMatchNum.containsKey(pId)) {
// tagsMatchNum.put(pId, tagsMatchNum.get(pId) + hitNum);
// } else
// tagsMatchNum.put(pId, hitNum);
// }
// } catch (IOException e) {
// logger.error("tagsMatch IOException: " + e);
// } catch (ParseException e) {
// logger.error("tagsMatch ParseException: " + e);
// }
//
// return tagsMatchNum;
// }
//get the number of terms hit in the docs
public static int getHitTermsNum(Explanation explanation){

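For reference, the MUST clauses built in prjToMemoMatchByLucene mean a memo is returned only when every token of the project name occurs in the searched field. A minimal sketch of that query shape, using illustrative field names, a hypothetical tokenize() helper standing in for Normalizer.getList(), and the same deprecated BooleanQuery constructor style as the surrounding code:
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;

public class AllTermsMatchExample {
    // hypothetical stand-in for Normalizer.getList(prjName)
    static List<String> tokenize(String name) {
        return Arrays.asList(name.toLowerCase().split("\\s+"));
    }

    public static void printMatches(IndexReader memoReader, String prjName) throws Exception {
        IndexSearcher searcher = new IndexSearcher(memoReader);
        BooleanQuery query = new BooleanQuery();
        for (String term : tokenize(prjName)) {
            // MUST: a memo only matches if it contains every project-name token
            query.add(new TermQuery(new Term("title", term)), BooleanClause.Occur.MUST);
        }
        TopDocs hits = searcher.search(query, 1000);
        for (ScoreDoc sd : hits.scoreDocs) {
            Document d = searcher.doc(sd.doc);
            System.out.println(d.get("memo_id") + " scored " + sd.score);
        }
    }
}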
View File

@ -9,7 +9,6 @@ import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import com.ossean.match.lucene.LuceneIndex;
import com.ossean.match.matchprocess.CountFrequency;
import com.ossean.match.matchprocess.MatchIncrement;
import com.ossean.match.matchprocess.MemoToPrjMatch;
import com.ossean.match.matchprocess.NewPrjMonitor;
@ -29,10 +28,6 @@ public class Main {
@Autowired
private PrjToMemoMatch prjToMemoMatch;
@Qualifier("countfrequency")
@Autowired
private CountFrequency countFrequency;
@Qualifier("luceneindex")
@Autowired
private LuceneIndex luceneIndex;
@ -49,31 +44,17 @@ public class Main {
public void start() throws InterruptedException, ParseException {
while (true) {
MatchIncrement matchIncrement = new MatchIncrement();
if (matchIncrement.getTaskToPrjId() != 0) { //check whether a cut-off project id is configured
prjToMemoMatch.setMatchIncrement(matchIncrement);
countFrequency.setMatchIncrement(matchIncrement);
if (matchIncrement.getPrjHistory() == 0 && matchIncrement.getMemoHistory() == 0) {
luceneIndex.run(matchIncrement);
}
newPrjMonitor.run();
countFrequency.run();
prjToMemoMatch.run();
}
else {
newPrjMonitor.setMatchIncrement(matchIncrement);
memoToPrjMatch.setMatchIncrement(matchIncrement);
prjToMemoMatch.setMatchIncrement(matchIncrement);
countFrequency.setMatchIncrement(matchIncrement);
if (matchIncrement.getPrjHistory() == 0 && matchIncrement.getMemoHistory() == 0) {
luceneIndex.run(matchIncrement);
}
newPrjMonitor.run();
countFrequency.run();
prjToMemoMatch.run();
memoToPrjMatch.run();
newPrjMonitor.setMatchIncrement(matchIncrement);
memoToPrjMatch.setMatchIncrement(matchIncrement);
prjToMemoMatch.setMatchIncrement(matchIncrement);
if (matchIncrement.getMemoHistory() == 0) {
luceneIndex.run(matchIncrement);
}
newPrjMonitor.run();
prjToMemoMatch.run();
memoToPrjMatch.run();
if (matchIncrement.getSleepTime() > 0) {
logger.info(".........sleeping.........." + matchIncrement.getSleepTime()/1000 + "s......");
}

View File

@ -1,364 +0,0 @@
package com.ossean.match.matchprocess;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Resource;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.wltea.analyzer.lucene.IKAnalyzer;
import com.ossean.match.dao.AtomicItemDao;
import com.ossean.match.dao.ProjectDao;
import com.ossean.match.dao.TagDao;
import com.ossean.match.lucene.LuceneIndex;
import com.ossean.match.model.Atom;
import com.ossean.match.model.Project;
import com.ossean.match.model.Tag;
import com.ossean.match.pipeline.AtomicItemPipeline;
import com.ossean.match.pipeline.ResultPipeline;
import com.ossean.match.utils.Extractor;
import com.ossean.match.utils.Normalizer;
import com.ossean.match.utils.SimilarityCounter;
@Component("countfrequency")
public class CountFrequency {
@Resource
private AtomicItemPipeline atomicItemPipeline;
@Resource
private AtomicItemDao atomicItemDao;
@Resource
private ProjectDao projectDao;
@Resource
private TagDao tagDao;
@Resource
private ResultPipeline resultPipeline;
public static List<Atom> atomList;
// = atomicItemDao.selectAllAtom();
public static Map<String, ArrayList<Integer>> atoms;
// = transformAtoms(atomList);
public static String tagIndexDir = "tagIndexDir";
private Logger logger = LoggerFactory.getLogger(getClass());
MatchIncrement matchIncrement = null;
public void setMatchIncrement(MatchIncrement matchIncrement) {
this.matchIncrement = matchIncrement;
}
private static Map<String, ArrayList<Integer>> transformAtoms(
List<Atom> atomList) {
// TODO Auto-generated method stub
ArrayList<Integer> nums = null;
Map<String, ArrayList<Integer>> atoms = new HashMap<String, ArrayList<Integer>>();
String name = null;
for (Atom atom : atomList) {
nums = new ArrayList<Integer>(2);
nums.add(atom.getNumTag());
nums.add(atom.getNumInProj());
nums.add(atom.getStatus());
name = atom.getName();
atoms.put(name, nums);
}
return atoms;
}
private List<Atom> retransformAtoms(Map<String, ArrayList<Integer>> atoms) {
List<Atom> atomList = new ArrayList<Atom>();
Set<String> items = atoms.keySet();
for (String item : items) {
Atom atom = new Atom();
atom.setName(item);
atom.setNumInTag(atoms.get(item).get(0));
atom.setNumInProj(atoms.get(item).get(1));
atom.setStatus(atoms.get(item).get(2));
atomList.add(atom);
}
return atomList;
}
private void extractTags(List<Tag> tags,
Map<String, ArrayList<Integer>> atoms) {
for (Tag tag : tags) {
List<String> items = Extractor.extractAtoms(tag.getName());
logger.info("extract tag : " + tag.getId() + ","
+ tag.getName() + ">>" + items);
saveAtoms(items, atoms, 0);
}
}
private void extractProjects(List<Project> projects,
Map<String, ArrayList<Integer>> atoms) {
for (Project project : projects) {
List<String> items = Extractor.extractAtoms(project.getName());
logger.info("extract project : " + project.getId() + ","
+ project.getName() + ">>" + items);
saveAtoms(items, atoms, 1);
}
}
/**
* @param items extracted elements
* @param atoms stores the extracted elements
* @param pos marks projects vs. tags: 1 for a project, 0 for a tag
*/
private void saveAtoms(List<String> items,
Map<String, ArrayList<Integer>> atoms, int pos) {
ArrayList<Integer> nums = null;
for (String item : items) {
if (atoms.containsKey(item)) {
nums = atoms.get(item);
if(nums.get(2) == 0) {
nums.set(2, 1);
}
} else {
nums = new ArrayList<Integer>(2);
nums.add(0);
nums.add(0);
nums.add(2);
}
nums.set(pos, nums.get(pos) + 1);
atoms.put(item, nums);
}
}
// match the project (projName) against tags and store the results in the database
private List<String> queryMatch(String projName, int projId, Map<String, ArrayList<Integer>> atoms)
throws ParseException {
Directory dir;
Analyzer analyzer = new IKAnalyzer(true);
List<String> resultStrings = new ArrayList<String>();
try {
dir = FSDirectory.open(Paths.get(CountFrequency.tagIndexDir));
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
QueryParser parser = new QueryParser("items", analyzer);
Query query = null;
if (projName != null && !"".equals(Normalizer.normalize(projName))) {
query = parser.parse(QueryParser.escape(Normalizer
.normalize(projName)));
TopDocs hits = searcher.search(query, 1000);
if (hits.scoreDocs != null) {
for (ScoreDoc sc : hits.scoreDocs) {
int docNum = sc.doc;
int tagId = Integer.parseInt(searcher.doc(docNum).get(
"id"));
String tagName = searcher.doc(docNum).get("name");
// System.out.println("match: " + projId + " - " + tagId
// +
// "("
// + tagName + ")");
// System.out.println("completely matched tag >>> id: "
// + tagId + ",name: " + tagOriginalName);
List<String> itemsOfPrj = Extractor
.extractAtoms(projName);
List<String> itemsOfTag = Extractor
.extractAtoms(tagName);
int prjWeight[] = new int[itemsOfPrj.size()];
int tagWeight[] = new int[itemsOfTag.size()];
int i = 0;
for (String item : itemsOfPrj) {
prjWeight[i++] = atoms.get(item).get(1);
}
i = 0;
for (String item : itemsOfTag) {
tagWeight[i++] = atoms.get(item).get(0);
}
float score = SimilarityCounter.countSimilarity(
itemsOfPrj, prjWeight, itemsOfTag, tagWeight);
float EPSINON = 0.999F;
if (!((score >= -EPSINON) && (score <= EPSINON))) {
resultPipeline.insertResult3(projId, projName,
tagId, tagName, score);
resultStrings.add(tagName);
}
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
return resultStrings;
}
// match the project (projName) against new tags and store the results in the database
private List<String> queryMatchNewTag(String tagName, int tagId, Map<String, ArrayList<Integer>> atoms)
throws ParseException {
// Directory dir;
tagName = tagName.trim();
Analyzer analyzer = new IKAnalyzer(true);
List<String> resultStrings = new ArrayList<String>();
try {
Directory dire = FSDirectory.open(Paths
.get(LuceneIndex.PRJS_INDEX_PATH));
IndexReader indexReader = DirectoryReader.open(dire);
// dir = FSDirectory.open(Paths.get(CountFrequency.tagIndexDir));
// IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(indexReader);
QueryParser parser = new QueryParser("items", analyzer);
Query query = null;
if (tagName != null && !"".equals(Normalizer.normalize(tagName))) {
query = parser.parse(QueryParser.escape(Normalizer
.normalize(tagName)));
TopDocs hits = searcher.search(query, 1000);
if (hits.scoreDocs != null) {
for (ScoreDoc sc : hits.scoreDocs) {
int docNum = sc.doc;
int projId = Integer.parseInt(searcher.doc(docNum).get(
LuceneIndex.prjIdFieldName));
String projName = searcher.doc(docNum).get(
LuceneIndex.prjNameFieldName);
// System.out.println("match: " + projId + " - " + tagId
// +
// "("
// + tagName + ")");
// System.out.println("completely matched tag >>> id: "
// + tagId + ",name: " + tagOriginalName);
List<String> itemsOfPrj = Extractor
.extractAtoms(projName);
List<String> itemsOfTag = Extractor
.extractAtoms(tagName);
int prjWeight[] = new int[itemsOfPrj.size()];
int tagWeight[] = new int[itemsOfTag.size()];
int i = 0;
for (String item : itemsOfPrj) {
prjWeight[i++] = atoms.get(item).get(1);
}
i = 0;
for (String item : itemsOfTag) {
tagWeight[i++] = atoms.get(item).get(0);
}
float score = SimilarityCounter.countSimilarity(
itemsOfPrj, prjWeight, itemsOfTag, tagWeight);
float EPSINON = 0.999F;
if (!((score >= -EPSINON) && (score <= EPSINON))) {
// TODO pipeline
resultPipeline.insertResult3(projId, projName,
tagId, tagName, score);
resultStrings.add(projName);
}
}
}
}
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return resultStrings;
}
public void run() throws ParseException {
atomList = atomicItemDao.selectAllAtom();
atoms = transformAtoms(atomList);
if (matchIncrement.getPrePrjHistory() == 0
&& matchIncrement.getTagHistory() == 0) {
List<Tag> tags = tagDao.getBatchTag(matchIncrement.getTagHistory(),
matchIncrement.getLastTagId());
// build the index
LuceneIndex.buildTagIndex(tags);
List<Project> projects = projectDao.getBatchPrjsIncre(2000000);
// extract projects
extractProjects(projects, atoms);
matchIncrement.getMatchRecord();
matchIncrement.setPrePrjHistory(matchIncrement.getLastPrjId());
// extract tags
extractTags(tags, atoms);
matchIncrement.setTagHistory(matchIncrement.getLastTagId());
// atomList.clear();
// atomList = retransformAtoms(atoms);
// atomicItemPipeline.pipelineAtoms(atomList);
// match new projects against tags
for (Project project : projects) {
queryMatch(project.getName().trim(),
project.getId(), atoms);
}
matchIncrement.writeIntoRecord();
} else {
if (matchIncrement.isNewPrePrjFlag()) {
logger.info("start count frequency of projects match!!!");
List<Project> projects = projectDao.getBatchPrjs(
matchIncrement.getPrjHistory(),
matchIncrement.getLastPrjId());
// extract projects
extractProjects(projects, atoms);
matchIncrement.setPrePrjHistory(matchIncrement
.getLastPrjId());
// match new projects against tags
for (Project project : projects) {
List<String> tags = new ArrayList<String>();
tags = queryMatch(project.getName().trim(),
project.getId(),atoms);
}
matchIncrement.writeIntoRecord();
}
if (matchIncrement.isNewTagflag()) {
logger.info("start count frequency of projects match!!!");
List<Tag> tags = tagDao.getBatchTag(
matchIncrement.getTagHistory(),
matchIncrement.getLastTagId());
// build the index
LuceneIndex.buildTagIndex(tags);
// extract tags
extractTags(tags, atoms);
matchIncrement.setTagHistory(matchIncrement.getLastTagId());
// match new tags against projects
for (Tag tag : tags) {
List<String> projects = new ArrayList<String>();
projects = queryMatchNewTag(tag.getName().trim(),
tag.getId(), atoms);
}
matchIncrement.writeIntoRecord();
}
}
atomList.clear();
atomList = retransformAtoms(atoms);
atomicItemPipeline.pipelineAtoms(atomList);
}
}

View File

@ -5,7 +5,6 @@ import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
@ -24,7 +23,6 @@ import com.ossean.match.lucene.LuceneSearch;
import com.ossean.match.model.Project;
import com.ossean.match.model.RelativeMemo;
import com.ossean.match.model.RelativeMemoToOpenSourceProject;
import com.ossean.match.pipeline.ResultPipeline;
import com.ossean.match.utils.Normalizer;
import javax.annotation.Resource;
@ -38,8 +36,6 @@ public class Match {
private double titleWeight = 0.8;
private double threshholdWeight = 1.3;
@Resource
private ResultPipeline resultPipeline;
@Resource
private RelativeMemoToOpenSourceProjectDao matchResultDao;
@Resource
@ -59,14 +55,11 @@ public class Match {
try {
Directory memoDire = FSDirectory.open(Paths
.get(LuceneIndex.MEMOS_INDEX_PATH));
Directory prjDire = FSDirectory.open(Paths
.get(LuceneIndex.PRJS_INDEX_PATH));
if (!DirectoryReader.indexExists(memoDire)) { // if the index directory
// doesn't exist
return;
}
IndexReader memoIndexReader = DirectoryReader.open(memoDire);
IndexReader prjIndexReader = DirectoryReader.open(prjDire);
for(Project project : projects) {
int prjId = 0;
String prjName = "";
@ -78,11 +71,14 @@ public class Match {
if(project.getTags() != null)
prjTags = project.getTags().toLowerCase();
prjName = project.getName().toLowerCase();
//long start1 = System.currentTimeMillis();
// match the project name against memo titles
if (prjName.length() >= 2 && !Normalizer.isAllNumber(prjName)) {
// match the project name against memo titles
matchMap = LuceneSearch.prjToMemoMatchByLucene(prjName,
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader, prjIndexReader);
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader);
// match the project name against memo tags
matchMap = LuceneSearch.searchMemoTags(
prjName, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight,
matchMap, memoIndexReader);
}
if (project.getSynonyms() != null) {
synonyms = project.getSynonyms().toLowerCase();
@ -91,88 +87,39 @@ public class Match {
//match project synonyms against memo titles TODO
for (String synonym : synonymsList) {
matchMap = LuceneSearch.prjToMemoMatchByLucene(synonym,
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader, prjIndexReader);
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader);
}
//match project synonyms against memo tags TODO
for (String synonym : synonymsList) {
matchMap = LuceneSearch.searchMemoTags(
synonym, LuceneIndex.memoTagsFieldName, memoTagsWeight,
synonym, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight,
matchMap, memoIndexReader);
}
}
}
//long end1 = System.currentTimeMillis();
//logger.info("项目名匹配帖子标题: " + (end1 - start1) + "ms");
// 项目名匹配帖子标签 TODO 词频统计可直接从索引获得
if (prjName.length() >= 2 && !Normalizer.isAllNumber(prjName)) {
Set<String> tagStrings = resultPipeline.getTagbyPrjId(prjId); // 这里要返回标签匹配到的项目id
for (String tag : tagStrings) {
matchMap = LuceneSearch.searchMemoTags(
tag.toLowerCase(), LuceneIndex.memoTagsFieldName, memoTagsWeight,
matchMap, memoIndexReader);
}
}
//long end2 = System.currentTimeMillis();
//logger.info("项目名匹配帖子标签: " + (end2 - end1) + "ms");
// 用项目标签搜索帖子标题
//HashMap<Integer, Integer> tagsCount = null;
if (prjTags != null && prjTags.length() > 0 && matchMap.size() > 0) {
List<String> prjTagsList = Normalizer.tagsSegmentation(prjTags);
// search memo titles with project tags
for (String prjTag : prjTagsList) {
matchMap = LuceneSearch.searchByPrjTagInMemoTitle(
prjTag, LuceneIndex.titleFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTitleWeight,
matchMap, memoIndexReader);
}
/*tagsCount = LuceneSearch.tagsMatch(
LuceneIndex.memoIdFieldName, prjTags,
LuceneIndex.titleFieldName, indexReader);
// tagsCount is <postId, x> where x is the number of tags matched in each post title
if (tagsCount.size() > 0) {
for (Map.Entry<Integer, Integer> t : tagsCount.entrySet()) {
int postId = t.getKey();
int x = t.getValue();
if (matchMap.containsKey(postId)) {
matchMap.put(postId, matchMap.get(postId) + matchMap.get(postId) * 0.5 * (Math.log(x * x + 1) / Math.log(2)));
}
}
}*/
//long end3 = System.currentTimeMillis();
//logger.info("用项目标签搜索帖子标题: " + (end3 - end2) + "ms");
// 用项目标签搜索帖子标签
for (String prjTag : prjTagsList) {
matchMap = LuceneSearch.searchByPrjTag(
prjTag, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTagsWeight,
matchMap, memoIndexReader);
}
/*tagsCount = LuceneSearch.tagsMatch(
LuceneIndex.memoIdFieldName, prjTags,
LuceneIndex.memoTagsFieldName, indexReader);
if (tagsCount.size() > 0) {
for (Map.Entry<Integer, Integer> t : tagsCount
.entrySet()) {
int postId = t.getKey();
int y = t.getValue();
if (matchMap.containsKey(postId)) {
matchMap.put(postId, matchMap.get(postId) + matchMap.get(postId) * 0.6 * (Math.log(y * y + 1) / Math.log(2)));
}
}
}*/
//long end4 = System.currentTimeMillis();
//logger.info("用项目标签搜索帖子标签: " + (end4 - end3) + "ms");
}
//long end5 = System.currentTimeMillis();
if (matchMap.size() > 0)
insertPrjToMemoMatchResult(prjId, matchMap);
matchMap.clear();
//long end6 = System.currentTimeMillis();
//logger.info("当前项目匹配结果入库: " + (end6 - end5) + "ms");
projectDao.updateProcessedPrj(prjId);
logger.info("current prjId: " + prjId);
}
memoIndexReader.close();
memoDire.close();
prjIndexReader.close();
prjDire.close();
} catch (IOException e) {
logger.error("prjToMemoMatch io error in Match: " + e);
}
@ -263,13 +210,13 @@ public class Match {
public void memoToPrjMatch(List<RelativeMemo> memos) {
HashMap<Integer, Double> matchMap = new HashMap<Integer, Double>();
try {
Directory dire = FSDirectory.open(Paths
Directory prjDire = FSDirectory.open(Paths
.get(LuceneIndex.PRJS_INDEX_PATH));
if (!DirectoryReader.indexExists(dire)) { // if the index directory
if (!DirectoryReader.indexExists(prjDire)) { // if the index directory
// doesn't exist
return;
}
IndexReader indexReader = DirectoryReader.open(dire);
IndexReader prjIndexReader = DirectoryReader.open(prjDire);
for(RelativeMemo memo : memos) {
int memoId = 0;
String memoTitle = "";
@ -288,88 +235,48 @@ public class Match {
if (Normalizer.isAllNumber(memoTitle)) {
continue;
}
//long start1 = System.currentTimeMillis();
List<String> memoTitleList = Normalizer.getList(memoTitle);
if (memoTitleList.size() > 0) {
// match the memo title against project names
matchMap = LuceneSearch.memoToPrjMatchByLucene(memoTitle,
memoTitleList, LuceneIndex.prjNameFieldName,
titleWeight, matchMap, indexReader);
titleWeight, matchMap, prjIndexReader);
//search project synonyms with the memo title
matchMap = LuceneSearch.memoToPrjMatchByLucene(memoTitle,
memoTitleList, LuceneIndex.prjSynonymsFieldName,
titleWeight, matchMap, indexReader);
titleWeight, matchMap, prjIndexReader);
}
//long end1 = System.currentTimeMillis();
//logger.info("帖子标题匹配项目名: " + (end1 - start1) + "ms");
List<String> memoTagsList = Normalizer.tagsSegmentation(memo.getTags().toLowerCase());
// 用帖子标签搜索项目名
if (memoTags.length() > 0) {
Set<Integer> projectIds = resultPipeline.getPrjIdForMemo(memoId); // this should return the project ids matched by tags
for (int pId : projectIds) {
if (matchMap.containsKey(pId)){
matchMap.put(pId, matchMap.get(pId) + memoTagsWeight);
}
else {
matchMap.put(pId, memoTagsWeight);
}
if (memoTags.length() > 0 && memoTagsList.size() > 0) {
//search project names with memo tags
for (String memoTag : memoTagsList) {
matchMap = LuceneSearch.searchMemoTags(memoTag,
LuceneIndex.prjNameFieldName, LuceneIndex.prjIdFieldName, memoTagsWeight, matchMap, prjIndexReader);
}
//search project synonyms with the post tags
for (String tag : memoTagsList) {
matchMap = LuceneSearch.searchMemoTags(tag,
LuceneIndex.prjSynonymsFieldName, LuceneIndex.prjIdFieldName, memoTagsWeight, matchMap, prjIndexReader);
}
}
//long end2 = System.currentTimeMillis();
//logger.info("帖子标签搜索项目名: " + (end2 - end1) + "ms");
if (matchMap.size() > 0) {
// match the post title against project tags
if (memoTitleList != null && memoTitleList.size() > 0) {
for (String memoTitleTerm : memoTitleList) {
matchMap = LuceneSearch.searchByPrjTag(
memoTitleTerm, LuceneIndex.prjTagsFieldName, LuceneIndex.prjIdFieldName, prjTagsToMemoTitleWeight,
matchMap, indexReader);
matchMap, prjIndexReader);
}
}
//long end3 = System.currentTimeMillis();
//logger.info("帖子标题匹配项目标签: " + (end3 - end2) + "ms");
/*HashMap<Integer, Integer> tagsCount = null;
tagsCount = LuceneSearch.tagsMatch(
LuceneIndex.prjIdFieldName, memoTitle,
LuceneIndex.prjTagsFieldName, indexReader);
// tagsCount是<prjId, x>x指匹配到prjTag的数量
if (tagsCount.size() > 0) {
for (Map.Entry<Integer, Integer> t : tagsCount
.entrySet()) {
int prjId = t.getKey();
int x = t.getValue();
if (matchMap.containsKey(prjId)) {
matchMap.put(prjId, matchMap.get(prjId) + matchMap.get(prjId) * 0.5
* (Math.log(x * x + 1) / Math.log(2)));
}
}
} */
// search project tags with the post tags
if (memo.getTags() != null && memo.getTags().length() > 0) {
List<String> memoTagsList = Normalizer.tagsSegmentation(memo.getTags().toLowerCase());
for (String memoTag : memoTagsList) {
matchMap = LuceneSearch.searchByPrjTag(
memoTag, LuceneIndex.prjTagsFieldName, LuceneIndex.prjIdFieldName, prjTagsToMemoTagsWeight,
matchMap, indexReader);
matchMap, prjIndexReader);
}
}
//long end4 = System.currentTimeMillis();
//logger.info("帖子标签搜索项目标签: " + (end4 - end3) + "ms");
/*if (memoTags.length() > 0) {
tagsCount = LuceneSearch.tagsMatch(
LuceneIndex.prjIdFieldName, memoTags,
LuceneIndex.prjTagsFieldName, indexReader);
if (tagsCount.size() > 0) {
for (Map.Entry<Integer, Integer> t : tagsCount.entrySet()) {
int prjId = t.getKey();
int y = t.getValue();
if (matchMap.containsKey(prjId)) {
matchMap.put(prjId, matchMap.get(prjId) + matchMap.get(prjId) * 0.6
* (Math.log(y * y + 1) / Math.log(2)));
}
}
}
}*/
if (matchMap.size() > 0)
insertMemoToPrjMatchResult(memoId, matchMap);
}
@ -377,8 +284,8 @@ public class Match {
matchMap.clear();
logger.info("current MemoId: " + memoId);
}
indexReader.close();
dire.close();
prjIndexReader.close();
prjDire.close();
} catch (Exception e) {
logger.error("memoToPrjMatch error in Match: " + e);
}
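Editor's note: the two match methods above accumulate a weighted score per candidate id in matchMap, and the commented-out blocks preserve an older boost that scales an existing score by how many tags matched, using a 0.5 factor and a log2(n^2 + 1) term. A minimal sketch of that boost under those assumptions; the names TagCountBoost, boostByTagCount and weight are illustrative and not part of the project.

import java.util.HashMap;
import java.util.Map;

// Sketch of the tag-count boost kept in the commented-out blocks of Match:
// an existing score grows by score * weight * log2(n^2 + 1), where n is the
// number of matched tags, so many tags help but never dominate the base score.
public class TagCountBoost {
    static void boostByTagCount(Map<Integer, Double> matchMap,
                                Map<Integer, Integer> tagsCount,
                                double weight) {
        for (Map.Entry<Integer, Integer> e : tagsCount.entrySet()) {
            int id = e.getKey();
            int n = e.getValue();
            Double score = matchMap.get(id);
            if (score != null) {
                matchMap.put(id, score + score * weight * (Math.log(n * n + 1) / Math.log(2)));
            }
        }
    }

    public static void main(String[] args) {
        Map<Integer, Double> matchMap = new HashMap<>();
        matchMap.put(42, 3.0);                 // base score for candidate 42
        Map<Integer, Integer> tagsCount = new HashMap<>();
        tagsCount.put(42, 2);                  // two matched tags
        boostByTagCount(matchMap, tagsCount, 0.5);
        System.out.println(matchMap.get(42));  // 3.0 + 3.0 * 0.5 * log2(5) ≈ 6.48
    }
}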

View File

@ -11,21 +11,11 @@ import org.slf4j.LoggerFactory;
public class MatchIncrement extends Thread implements Runnable {
private Logger logger = LoggerFactory.getLogger(getClass());
private int prjHistory; // the record of project after matching last time
private int memoHistory; // the record of memo after matching last time
private int prjIndexed; //the record of prj indexed to
private int memoIndexed; //the record of memo indexed to
private static File record; // record file "record.txt"
private boolean newPrjComing; // flag: new projects have arrived
private boolean newMemoComing; // flag: new memos have arrived
private boolean currentMatchDone = true; // whether the current batch match has finished
private int lastPrjId = 0; // largest project id
private int lastTagId = 0; // largest tag id
private int tagHistory = 0; // tag matching progress record
private boolean newTagflag; // flag: new tags have arrived
private boolean newPrePrjFlag; // flag: new projects still need tag matching
private int prePrjHistory = 0; // project-to-tag matching progress record
private int taskToPrjId = 0;
private int sleepTime = 0;
public boolean isNewPrjComing() {
@ -52,91 +42,11 @@ public class MatchIncrement extends Thread implements Runnable {
this.sleepTime = sleepTime;
}
public int getTaskToPrjId() {
return taskToPrjId;
}
public void setTaskToPrjId(int taskToPrjId) {
this.taskToPrjId = taskToPrjId;
}
public int getMemoIndexed() {
return memoIndexed;
}
public void setMemoIndexed(int memoIndexed) {
this.memoIndexed = memoIndexed;
}
public int getPrjIndexed() {
return prjIndexed;
}
public void setPrjIndexed(int prjIndexed) {
this.prjIndexed = prjIndexed;
}
public boolean isNewPrePrjFlag() {
return newPrePrjFlag;
}
public void setNewPrePrjFlag(boolean newPrePrjFlag) {
this.newPrePrjFlag = newPrePrjFlag;
}
public int getPrePrjHistory() {
return prePrjHistory;
}
public void setPrePrjHistory(int prePrjHistory) {
this.prePrjHistory = prePrjHistory;
}
public int getTagHistory() {
return tagHistory;
}
public void setTagHistory(int tagHistory) {
this.tagHistory = tagHistory;
}
public boolean isNewTagflag() {
return newTagflag;
}
public void setNewTagflag(boolean newTagflag) {
this.newTagflag = newTagflag;
}
public void setLastPrjId(int id) {
this.lastPrjId = id;
}
public int getLastPrjId() {
return lastPrjId;
}
public int getLastTagId() {
return lastTagId;
}
public void setLastTagId(int id) {
this.lastTagId = id;
}
public MatchIncrement() {
record = new File("record.txt");
getMatchRecord();
}
public int getPrjHistory() {
return prjHistory;
}
public synchronized void setPrjHistory(int prjHistory) {
this.prjHistory = prjHistory;
}
public int getMemoHistory() {
return memoHistory;
}
@ -160,13 +70,7 @@ public class MatchIncrement extends Thread implements Runnable {
public void getMatchRecord() {
try {
Scanner in = new Scanner(record);
this.setPrjHistory(in.nextInt());
this.setMemoHistory(in.nextInt());
this.setTagHistory(in.nextInt());
this.setPrePrjHistory(in.nextInt());
this.setPrjIndexed(in.nextInt());
this.setMemoIndexed(in.nextInt());
this.setTaskToPrjId(in.nextInt());
in.close();
} catch (FileNotFoundException e) {
logger.error("getMatchRecord FileNotFoundException " + e);
@ -179,8 +83,7 @@ public class MatchIncrement extends Thread implements Runnable {
public void writeIntoRecord() {
try {
FileOutputStream fout = new FileOutputStream(record);
fout.write((prjHistory + "\t" + memoHistory + "\t" + tagHistory
+ "\t" + prePrjHistory + "\t" + prjIndexed + "\t" + memoIndexed + "\t" + taskToPrjId).getBytes());
fout.write((memoHistory + "").getBytes());
fout.close();
} catch (FileNotFoundException e) {
logger.error("writeIntoRecord FileNotFoundException " + e);

View File

@ -2,7 +2,6 @@ package com.ossean.match.matchprocess;
import java.io.IOException;
import java.sql.SQLException;
import java.text.DecimalFormat;
import java.util.List;
import javax.annotation.Resource;
@ -41,7 +40,6 @@ public class MemoToPrjMatch implements Runnable {
public void memoToPrjIncrement() throws SQLException,
IOException {
matchIncrement.setCurrentMatchDone(false);
long start = System.currentTimeMillis();
matchIncrement.getMatchRecord();
int startId = matchIncrement.getMemoHistory();
int endId = startId + step;
@ -49,25 +47,15 @@ public class MemoToPrjMatch implements Runnable {
if ((lastMemoId - endId) <= step) {
endId = lastMemoId;
}
LuceneIndex luceneIndex = new LuceneIndex();
while (startId < lastMemoId) {
long batchStart = System.currentTimeMillis();
List<RelativeMemo> memos = memoDao.getMemoInfo(startId, endId);
if (memos != null && memos.size() > 0) {
match.memoToPrjMatch(memos);
matchIncrement.setMemoHistory(endId);
if (startId > matchIncrement.getMemoIndexed()) {
LuceneIndex luceneIndex = new LuceneIndex();
int memoIndexed = luceneIndex.memoIndexIncrement(memos);
// adding memos index and save the record
matchIncrement.setMemoIndexed(memoIndexed);
}
// adding memos index and save the record
luceneIndex.memoIndexIncrement(memos);
}
long batchEnd = System.currentTimeMillis();
DecimalFormat df = new DecimalFormat( "0.00");
double totalTime = (double)(batchEnd - batchStart)/60000;
double rate = (double)(endId - startId)/((batchEnd - batchStart)/1000);
logger.info("current Batch memo-->prj total time: " + df.format(totalTime) + "min; "
+ " average " + df.format(rate) + " 条/s");
logger.info("relative_memos_to_osps matched to : "
+ matchIncrement.getMemoHistory());
logger.info("relative_memos_to_osps remain_to_match : "
@ -83,13 +71,8 @@ public class MemoToPrjMatch implements Runnable {
endId += step;
}
}
long end = System.currentTimeMillis();
DecimalFormat df = new DecimalFormat( "0.00");
double totalTime = (double)(end - start)/60000;
logger.info("memo-->prj total time: " + df.format(totalTime) + "min");
matchIncrement.setCurrentMatchDone(true);
}
public void run() {
if (matchIncrement.isNewMemoComing()) {
logger.info("start relative_memos_to_osps match!!!");

View File

@ -26,13 +26,9 @@ public class NewPrjMonitor {
public void run() {
int newPrjCount = projectDao.getNewPrjCount();
int lastPrjId = projectDao.getNewLast();
int lastMemoId = memoDao.getLastMemoId();
int lastTagId = tagDao.getNewLast();
if (newPrjCount > 0) {
matchIncrement.setNewPrjComing(true);
matchIncrement.setLastPrjId(lastPrjId);
} else {
matchIncrement.setNewPrjComing(false);
}
@ -42,19 +38,6 @@ public class NewPrjMonitor {
matchIncrement.setNewMemoComing(false);
}
if (newPrjCount > 0) {
matchIncrement.setNewPrePrjFlag(true);
} else {
matchIncrement.setNewPrePrjFlag(false);
}
if (lastTagId > matchIncrement.getTagHistory()) {
matchIncrement.setNewTagflag(true);
matchIncrement.setLastTagId(lastTagId);
} else {
matchIncrement.setNewTagflag(false);
}
if (!matchIncrement.isNewMemoComing()&&!matchIncrement.isNewPrjComing()) {
matchIncrement.setSleepTime(30000);
}else {

View File

@ -2,7 +2,6 @@ package com.ossean.match.matchprocess;
import java.io.IOException;
import java.sql.SQLException;
import java.text.DecimalFormat;
import java.util.List;
import javax.annotation.Resource;
@ -34,73 +33,31 @@ public class PrjToMemoMatch implements Runnable {
public void prjToMemoIncrement() throws IOException, SQLException {
matchIncrement.setCurrentMatchDone(false);
long start = System.currentTimeMillis();
matchIncrement.getMatchRecord();
List<Project> newPrjs = projectDao.getBatchPrjsIncre(step);// batch-read projects whose filtration is 1, or whose filtration is 2 with a non-empty update_mark, i.e. unmatched projects plus matched ones that have been updated
if(newPrjs == null || newPrjs.size() == 0){
matchIncrement.setPrjHistory(matchIncrement.getLastPrjId());
matchIncrement.writeIntoRecord();
}
// batch-read projects whose filtration is 1, or whose filtration is 2 with a non-empty update_mark, i.e. unmatched projects plus matched ones that have been updated
List<Project> newPrjs = projectDao.getBatchPrjsIncre(step);
while (newPrjs != null && newPrjs.size() > 0) {
long batchStart = System.currentTimeMillis();
long end1 = System.currentTimeMillis();
logger.info("批量读项目: " + (end1 - batchStart) + "ms");
if (newPrjs != null) {
match.prjToMemoMatch(newPrjs);
matchIncrement.setPrjHistory(newPrjs.get(newPrjs.size()-1).getId());
// adding projects index and save the record
// adding projects index
LuceneIndex luceneIndex = new LuceneIndex();
int prjIndexed = luceneIndex.prjIndexIncrement(newPrjs);
matchIncrement.setPrjIndexed(prjIndexed);
luceneIndex.prjIndexIncrement(newPrjs);
}
long batchEnd = System.currentTimeMillis();
DecimalFormat df = new DecimalFormat( "0.00");
double totalTime = (double)(batchEnd - batchStart)/60000;
double rate = (double)(newPrjs.size())/((batchEnd - batchStart)/1000);
logger.info("current Batch prj-->memo total time: " + df.format(totalTime) + "min; "
+ " average " + df.format(rate) + " 条/s");
// finishing matching, writing the record to "record.txt"
matchIncrement.writeIntoRecord();
newPrjs = projectDao.getBatchPrjsIncre(step);// getting projects' id and prjName in batches
// getting projects' id and prjName in batches
newPrjs = projectDao.getBatchPrjsIncre(step);
}
long end = System.currentTimeMillis();
DecimalFormat df = new DecimalFormat( "0.00");
double totalTime = (double)(end - start)/60000;
logger.info("prj-->memo total time: " + df.format(totalTime) + "min");
matchIncrement.setCurrentMatchDone(true);
}
public void run() {
if (matchIncrement.getTaskToPrjId() != 0) {
int startId = matchIncrement.getPrjHistory();
int endId = matchIncrement.getTaskToPrjId();
long batchStart = System.currentTimeMillis();
List<Project> newPrjs = projectDao.getBatchPrjs(startId, endId);// getting projects' id and prjName in batches
long end1 = System.currentTimeMillis();
logger.info("批量读项目: " + (end1 - batchStart) + "ms");
if (newPrjs != null) {
match.prjToMemoMatch(newPrjs);
}
long batchEnd = System.currentTimeMillis();
DecimalFormat df = new DecimalFormat( "0.00");
double totalTime = (double)(batchEnd - batchStart)/60000;
double rate = (double)(endId - startId)/((batchEnd - batchStart)/1000);
logger.info("current Batch prj-->memo total time: " + df.format(totalTime) + "min; "
+ " average " + df.format(rate) + " 条/s");
logger.info("osps_to_relative_memos matched to : "
+ matchIncrement.getPrjHistory());
System.exit(0);
}
else {
if (matchIncrement.isNewPrjComing()) {
logger.info("start osps_to_relative_memos match!!!");
try {
prjToMemoIncrement();
} catch (IOException e) {
logger.error("IOException: " + e);
} catch (SQLException e) {
logger.error("SQLException: " + e);
}
if (matchIncrement.isNewPrjComing()) {
logger.info("start osps_to_relative_memos match!!!");
try {
prjToMemoIncrement();
} catch (IOException e) {
logger.error("IOException: " + e);
} catch (SQLException e) {
logger.error("SQLException: " + e);
}
}

View File

@ -1,68 +0,0 @@
package com.ossean.match.model;
public class Taggings {
private int id;
private int tag_id;
private int taggable_id;
private String taggable_type;
private int tagger_id;
private String tagger_type;
private String context;
private String created_at;
private int disagree_num;
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public int getTag_id() {
return tag_id;
}
public void setTag_id(int tag_id) {
this.tag_id = tag_id;
}
public int getTaggable_id() {
return taggable_id;
}
public void setTaggable_id(int taggable_id) {
this.taggable_id = taggable_id;
}
public String getTaggable_type() {
return taggable_type;
}
public void setTaggable_type(String taggable_type) {
this.taggable_type = taggable_type;
}
public int getTagger_id() {
return tagger_id;
}
public void setTagger_id(int tagger_id) {
this.tagger_id = tagger_id;
}
public String getTagger_type() {
return tagger_type;
}
public void setTagger_type(String tagger_type) {
this.tagger_type = tagger_type;
}
public String getContext() {
return context;
}
public void setContext(String context) {
this.context = context;
}
public String getCreated_at() {
return created_at;
}
public void setCreated_at(String created_at) {
this.created_at = created_at;
}
public int getDisagree_num() {
return disagree_num;
}
public void setDisagree_num(int disagree_num) {
this.disagree_num = disagree_num;
}
}

View File

@ -1,57 +0,0 @@
package com.ossean.match.pipeline;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Resource;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import com.ossean.match.dao.AtomicItemDao;
import com.ossean.match.model.Atom;
@Component
public class AtomicItemPipeline {
@Resource
private AtomicItemDao atomicItemDao;
// 把原子性的
@Transactional
public void pipelineAtoms(Map<String, ArrayList<Integer>> atoms) {
// TODO Auto-generated method stub
Set<String> atomNames = atoms.keySet();
ArrayList<Integer> nums;
for (String name : atomNames) {
nums = atoms.get(name);
atomicItemDao.insertAtomicItem(name, nums.get(0), nums.get(1));
}
}
@Transactional
// 0: 未更新
// 1: 数据更新
// 2: 数据插入
public void pipelineAtoms(List<Atom> atoms) {
// TODO Auto-generated method stub
for (Atom atom : atoms) {
int status = atom.getStatus();
switch (status) {
case 0:
break;
case 1:
atomicItemDao.updateAtomicItem(atom.getName(),
atom.getNumTag(), atom.getNumInProj());
break;
default:
atomicItemDao.insertAtomicItem(atom.getName(),
atom.getNumTag(), atom.getNumInProj());
}
}
}
}

View File

@ -1,163 +0,0 @@
package com.ossean.match.pipeline;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import javax.annotation.Resource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import com.ossean.match.dao.ResultDao;
import com.ossean.match.dao.TaggingDao;
import com.ossean.match.model.Project;
@Component
public class ResultPipeline {
@Resource
private ResultDao resultDao;
@Resource
private TaggingDao taggingDao;
private Logger logger = LoggerFactory.getLogger(getClass());
private LinkedList<Item> items1 = new LinkedList<Item>();
private LinkedList<Item> items2 = new LinkedList<Item>();
private LinkedList<Item> items3 = new LinkedList<Item>();
private LinkedList<Project> projects = new LinkedList<Project>();
@Transactional
public void insertResult(int prjId, String string, int tagId,
String tagOriginalName, float score) {
}
@Transactional
public Set<Integer> getPrjIdForMemo(int memoId) {
List<Integer> tagIds = taggingDao.getTagIdForMemo(memoId);
Set<Integer> result = new HashSet<Integer>();
for (int tagId : tagIds) {
List<Integer> tempIntegers = resultDao.getMatchPrjId(tagId);
result.addAll(tempIntegers);
}
return result;
}
public Set<String> getTagbyPrjId(int prjId){
List<String> tags = resultDao.getMatchTag(prjId);
Set<String> result = new HashSet<String>();
for (String tag : tags) {
// List<String> tempIntegers = resultDao.getMatchPrjId(tagId);
result.add(tag);
}
return result;
}
@Transactional
public void updateRank(Project project, int flag) {
if (flag == 1) {
projects.add(project);
if (projects.size() >= 10000) {
for (Project tproject : projects) {
System.out.println("handle project:" + tproject.getId());
resultDao.updateRank(tproject.getRank(),
tproject.getStar(), tproject.getId());
}
projects.clear();
}
} else {
for (Project tproject : projects) {
System.out.println("handle project:" + tproject.getId());
resultDao.updateRank(tproject.getRank(), tproject.getStar(),
tproject.getId());
}
projects.clear();
}
}
@Transactional
public void insertResult1(int prjId, String prjName, int tagId,
String tagName, float score) {
items1.add(new Item(prjId, prjName, tagId, tagName, score));
// 10000条输入数据库一次
if (items1.size() >= 10000) {
for (Item item : items1) {
resultDao.insertResult1(item.getPrjId(), item.getPrjName(),
item.getTagId(), item.getTagName(), item.getScore());
}
items1.clear();
}
}
@Transactional
public void insertResult2(int prjId, String prjName, int tagId,
String tagName, float score) {
items2.add(new Item(prjId, prjName, tagId, tagName, score));
// 10000条输入数据库一次
if (items2.size() >= 10000) {
for (Item item : items2) {
resultDao.insertResult2(item.getPrjId(), item.getPrjName(),
item.getTagId(), item.getTagName(), item.getScore());
}
items2.clear();
}
}
@Transactional
public void insertResult3(int prjId, String prjName, int tagId,
String tagName, float score) {
items3.add(new Item(prjId, prjName, tagId, tagName, score));
// 10000条输入数据库一次
if (items3.size() >= 10000) {
logger.info("match tag and project :" + prjId);
for (Item item : items3) {
resultDao.insertResult3(item.getPrjId(), item.getPrjName(),
item.getTagId(), item.getTagName(), item.getScore());
}
items3.clear();
}
}
}
class Item {
public Item(int prjId, String prjName, int tagId, String tagName,
float score) {
super();
this.tagName = tagName;
this.prjId = prjId;
this.prjName = prjName;
this.tagId = tagId;
this.score = score;
}
public int getPrjId() {
return prjId;
}
public String getPrjName() {
return prjName;
}
public int getTagId() {
return tagId;
}
public float getScore() {
return score;
}
public String getTagName() {
return tagName;
}
int prjId;
String prjName;
int tagId;
String tagName;
float score;
}

View File

@ -38,7 +38,7 @@
<property name="url"
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8" />
<property name="username" value="root" />
<property name="password" value="1234" />
<property name="password" value="123456" />
<property name="initialSize" value="5" />
<property name="maxActive" value="100" />
</bean>

View File

@ -16,4 +16,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.HotwordsMain >>log/hotwords.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.HotwordsMain > log/hotwords.log 2>&1 &

View File

@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx1024m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxT
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.ProjectsFilterMain >>log/projectsfilter.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.ProjectsFilterMain > log/projectsfilter.log 2>&1 &

View File

@ -39,9 +39,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://172.16.128.30:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://172.16.128.36:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="gather" />
<property name="password" value="influx1234" />
<property name="password" value="Influx@1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>

View File

@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx1024m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxT
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.SyncMatchResultMain >>log/syncmatchresult.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.SyncMatchResultMain > log/syncmatchresult.log 2>&1 &

View File

@ -17,4 +17,4 @@ JVM_ARGS="-Xmn98m -Xmx512m -Xms512m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTe
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.UpdateOspTagsMain >>log/updateosptags.log 2>&1 &
java $JVM_ARGS -classpath $CLASSPATH com.ossean.projectmanager.UpdateOspTagsMain > log/updateosptags.log 2>&1 &

View File

@ -16,7 +16,8 @@ public class UpdateOspTagsMain {
private UpdateOspTags updateTags;
public void start(){
updateTags.updateOspTags();
// updateTags.updateOspTags();
updateTags.restorePrjTaggings();
}
public static void main(String[] args){

View File

@ -13,6 +13,7 @@ import com.ossean.projectmanager.lasttabledao.TagDao;
import com.ossean.projectmanager.lasttabledao.TaggingsDao;
import com.ossean.projectmanager.model.OpenSourceProject;
import com.ossean.projectmanager.model.Taggings;
import com.ossean.projectmanager.utils.DataHandler;
/**
*
@ -28,6 +29,9 @@ public class UpdateOspTags {
private TagDao tagDao;
@Resource
private TaggingsDao taggingsDao;
private String taggingsTableName = "taggings";
private String tagsTableName = "tags";
/**
* Update the project tag field tags and the higher-weight tag field tags_for_search from taggings
@ -67,5 +71,50 @@ public class UpdateOspTags {
}
}
public void restorePrjTaggings(){
logger.info("start updateOspTags ......");
int start = 0;
int prjId = 0;
while (start < 2000000) {
List<OpenSourceProject> ospList = ospDao.getProjectsByBatch(start,
5000);
for (OpenSourceProject osp : ospList) {
prjId = osp.getId();
// split the tags apart and store each tag-project relation
String tags = osp.getTags();
if (tags == null) {
// this project has no tags
continue;
}
List<String> tagList = DataHandler
.tagsSegmentation(tags);
for (String tag : tagList) {
ospDao.insertTag(tagsTableName, tag);// insert the project's tag with "insert ignore"
int tag_id = ospDao.selectTagIdByName(
tagsTableName, tag);
Taggings taggings = new Taggings();
taggings.setTag_id(tag_id);
taggings.setTaggable_id(prjId);
taggings.setTaggable_type("OpenSourceProject");
taggings.setContext("tags");
taggings.setCreated_at(DataHandler.getNow());
// persist the Taggings object
try {
ospDao.insertTaggings(taggingsTableName,
taggings);
} catch (Exception e) {
// the corresponding record was already deleted from relative_memos before this insert
logger.error("insert taggings error: " + e);
System.exit(0);
}
}
logger.info("currentPrjId: " + prjId);
}
logger.info("last prj batch end, currentPrjId: " + prjId);
start = prjId + 5000;
}
}
}
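Editor's note: restorePrjTaggings relies on the two DAO calls behaving as a get-or-create pair: insert ignore is a no-op for a tag name that already exists, and the follow-up select returns the id either way, after which one taggings row is written per (tag, project) pair. A condensed sketch of that flow for a single project; TagDaoLike and restoreOneProject are stand-in names, the Taggings model is assumed to be the one shown in this commit, and whether duplicate taggings rows are really skipped depends on the table having a matching unique key.

import java.util.List;
import com.ossean.projectmanager.model.Taggings;

// Condensed get-or-create flow used by restorePrjTaggings.
public class TagRestoreSketch {
    interface TagDaoLike { // stand-in for the MyBatis mapper shown in this commit
        void insertTag(String table, String name);
        int selectTagIdByName(String table, String name);
        void insertTaggings(String table, Taggings model);
    }

    static void restoreOneProject(TagDaoLike dao, int prjId, List<String> tagNames, String now) {
        for (String tag : tagNames) {
            dao.insertTag("tags", tag);                     // silently skipped if the tag already exists
            int tagId = dao.selectTagIdByName("tags", tag); // id of the new or existing row
            Taggings t = new Taggings();
            t.setTag_id(tagId);
            t.setTaggable_id(prjId);
            t.setTaggable_type("OpenSourceProject");
            t.setContext("tags");
            t.setCreated_at(now);
            dao.insertTaggings("taggings", t);              // ignored on duplicate key, if the table defines one
        }
    }
}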

View File

@ -2,15 +2,32 @@ package com.ossean.projectmanager.lasttabledao;
import java.util.List;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.apache.ibatis.annotations.Update;
import com.ossean.projectmanager.model.OpenSourceProject;
import com.ossean.projectmanager.model.Taggings;
public interface OpenSourceProjectDao {
//get the largest project id
// store a row into the tags table
@Insert("insert ignore into ${table} (name) values (#{name})")
public void insertTag(@Param("table") String table,
@Param("name") String name);
// look up a tag's id by its name
@Select("select id from ${table} where name=#{name}")
public int selectTagIdByName(@Param("table") String table,
@Param("name") String name);
// insert a Taggings object
@Insert("insert ignore into ${table} (tag_id,taggable_id,taggable_type,context,created_at,disagree_num) values (#{model.tag_id},#{model.taggable_id},#{model.taggable_type},#{model.context},#{model.created_at},#{model.disagree_num})")
public void insertTaggings(@Param("table") String table,
@Param("model") Taggings model);
// get the largest project id
@Select("select MAX(id) from open_source_projects")
public Integer getNewLast();
@ -19,10 +36,9 @@ public interface OpenSourceProjectDao {
public List<OpenSourceProject> getProjectsByBatch(
@Param("start") int start, @Param("size") int size);
//update the project tag field tags and the higher-weight tag field tags_for_search
// update the project tag field tags and the higher-weight tag field tags_for_search
@Update("update open_source_projects set tags=#{tags}, tags_for_search = #{tagsForSearch} where id=#{id}")
public void updatePrjTags(@Param("id") int id,
@Param("tags") String tags,
public void updatePrjTags(@Param("id") int id, @Param("tags") String tags,
@Param("tagsForSearch") String tagsForSearch);
// update the project's tag attributes

View File

@ -68,9 +68,9 @@ public class ProjectsFilter {
.getOpenHubPrjByUrl(url); // fetch the project record from the OpenHub sub-table by url
if (openhubProject != null
&& openhubProject.getName() != null
&& openhubProject.getName() != ""
&& !"".equals(openhubProject.getName())
&& openhubProject.getDescription() != null
&& openhubProject.getDescription() != ""
&& !"".equals(openhubProject.getDescription())
&& openhubProject.getCodeRepository() != null
&& !openhubProject.getCodeRepository().contains(
"add a code location")) { // openhub的筛选条件为namedescription不为空且该项目有版本库
@ -80,10 +80,11 @@ public class ProjectsFilter {
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // delete this project's match results so no stale results remain
} else {
lastProjectDao.updateFiltratedPrj(project.getId(),
2); // filtration flag goes from 1 or 2 to 2, i.e. the project was already kept by the filter
}
}
// else {
// lastProjectDao.updateFiltratedPrj(project.getId(),
// 2); // filtration flag goes from 1 or 2 to 2, i.e. the project was already kept by the filter
// }
} else {
lastProjectDao.updateFiltratedPrj(project.getId(), 0); // filtration flag set to 0, i.e. the project is dropped
matchResultDao.deleteMatchResult(
@ -95,9 +96,9 @@ public class ProjectsFilter {
.getSourceForgePrjByUrl(url); // fetch the project record from the SourceForge sub-table by url
if (sourceforgeProject != null
&& sourceforgeProject.getName() != null
&& sourceforgeProject.getName() != ""
&& !"".equals(sourceforgeProject.getName())
&& sourceforgeProject.getDescription() != null
&& sourceforgeProject.getDescription() != ""
&& !"".equals(sourceforgeProject.getDescription())
&& ((sourceforgeProject.getDownload_num() > 0) || (sourceforgeProject
.getStars() > 0))) {
if (project.getFilration() == 0) {
@ -106,10 +107,11 @@ public class ProjectsFilter {
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // delete this project's match results so no stale results remain
} else {
lastProjectDao.updateFiltratedPrj(project.getId(),
2); // filtration flag goes from 1 or 2 to 2, i.e. the project was already kept by the filter
}
}
// else {
// lastProjectDao.updateFiltratedPrj(project.getId(),
// 2); // filtration flag goes from 1 or 2 to 2, i.e. the project was already kept by the filter
// }
} else {
lastProjectDao.updateFiltratedPrj(project.getId(), 0); // filtration flag set to 0, i.e. the project is dropped
matchResultDao.deleteMatchResult(
@ -122,9 +124,10 @@ public class ProjectsFilter {
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // delete this project's match results so no stale results remain
} else {
lastProjectDao.updateFiltratedPrj(project.getId(), 2); // filtration flag goes from 1 or 2 to 2, i.e. the project was already kept by the filter
}
}
// else {
// lastProjectDao.updateFiltratedPrj(project.getId(), 2); // filtration flag goes from 1 or 2 to 2, i.e. the project was already kept by the filter
// }
} else {
logger.info("Unknown source... source = " + source);
}
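Editor's note: the filter now tests emptiness with !"".equals(...) instead of != "". The old form compares object references, so an empty string read from the database (a distinct object from the "" literal) slipped through as "non-empty"; the constant-first equals form does a value comparison and also tolerates null. A tiny illustration with made-up values:

// Why the filter switched from != "" to !"".equals(...):
public class EmptyCheckDemo {
    public static void main(String[] args) {
        String fromDb = new String("");          // empty, but a distinct object from the "" literal
        System.out.println(fromDb != "");        // true  -> reference check wrongly treats it as non-empty
        System.out.println(!"".equals(fromDb));  // false -> value check correctly flags it as empty
        String missing = null;
        System.out.println(!"".equals(missing)); // true, and no NullPointerException (null is guarded separately above)
    }
}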

View File

@ -0,0 +1,38 @@
package com.ossean.projectmanager.utils;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class DataHandler {
// get the current time as a String
public static String getNow() {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
String result = sdf.format(new Date());
return result;
}
// tag segmentation helper: splits an "<a><b>"-style string into individual tags
public static List<String> tagsSegmentation(String tags) {
List<String> tag = new ArrayList<String>();
if (tags != null) {
String regex = "<[^<>]*>";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(tags);
while (matcher.find()) {
String t = matcher.group();
t = t.substring(1, t.length() - 1);
tag.add(t);
}
}
return tag;
}
}
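Editor's note: tagsSegmentation expects the tags column to be a run of angle-bracketed names such as "<java><web framework>" and returns the names in order with the brackets stripped; a null input yields an empty list. A short usage sketch (the sample strings are made up):

import java.util.List;
import com.ossean.projectmanager.utils.DataHandler;

// Usage sketch for DataHandler.tagsSegmentation; the input strings are made-up examples.
public class TagsSegmentationDemo {
    public static void main(String[] args) {
        List<String> tags = DataHandler.tagsSegmentation("<java><web framework><orm>");
        System.out.println(tags);                                 // [java, web framework, orm]
        System.out.println(DataHandler.tagsSegmentation(null));   // [] for a project without tags
    }
}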

View File

@ -19,9 +19,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://172.16.128.33:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="NUDTpdl@" />
<property name="password" value="NUDTpdl@123" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>

View File

@ -46,7 +46,7 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<version>4.10</version>
<scope>test</scope>
</dependency>

View File

@ -16,15 +16,35 @@ public class ClearTable {
Logger logger = Logger.getLogger(ClearTable.class);
public void start(){
truncateTable("edd_pointers");
truncateTable("synonyms");
truncateTable("edd_relations");
truncateTable("synonymmings");
truncateTable("open_source_projects");
deleteTaggings4Project();
deleteItemInEddPointers("gather_projects", "edd_relations");
//deleteTaggings4Project();
logger.info("完成去重程序运行前数据表的清空和taggings表OpenSourceProject的删除");
}
//delete the rows in edd_pointers for the given source/target transfer
public void deleteItemInEddPointers(String sourceTableName, String targetTableName){
String sql = "delete from edd_pointers where SourceTableName=? and TargetTableName=?";
Connection conn = getConnection();
if(conn == null){
logger.info("没有获取到Connection");
System.exit(-1);
}
try {
PreparedStatement ps = conn.prepareStatement(sql);
ps.setString(1, sourceTableName);
ps.setString(2, targetTableName);
ps.execute();
ps.close();
conn.close();
} catch (SQLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void truncateTable(String table){
String sql = "truncate table " + table;
Connection conn = getConnection();
@ -65,7 +85,7 @@ public class ClearTable {
Connection conn = null;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
conn = DriverManager.getConnection("jdbc:mysql://172.16.128.30:3306/ossean", "root", "123456");
conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/ossean_production", "root", "NUDTpdl@123");
} catch (InstantiationException e) {
e.printStackTrace();
} catch (IllegalAccessException e) {

View File

@ -1,98 +1,113 @@
package com.ossean;
import java.util.List;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import com.ossean.databaseSource.DBSource;
import com.ossean.databaseSource.GatherDao;
import com.ossean.databaseSource.UpdateControlProjectsDao;
import com.ossean.model.GatherProjectsModel;
import com.ossean.util.MergeProjectNew;
import com.ossean.util.MergeProjectsUtil;
@Component
public class MergeProjects {
Logger logger = Logger.getLogger(this.getClass());
@Resource
private DBSource dbSource;
@Resource
private GatherDao gatherDao;
@Resource
private UpdateControlProjectsDao updateControlDao;
@Qualifier("mergeProjectNew")
@Autowired
private MergeProjectNew mergeProjectNew;
private int startId1;
private static String pointerTableName = "edd_pointers";
private static String sourceTableName = "gather_projects";
private static String targetTableName = "edd_relations";
private static int batchSize = 500;
//读指针
public int readPointer(String table, String source, String target, int minId){
int pointer = minId;
try {
pointer = dbSource.getPointer(table, source, target);
} catch(Exception e) {
logger.info("No such pointer! Create one");
dbSource.insertPointer(table, source, target, pointer);
}
return pointer;
}
public void start(){
int minId = 1;
int maxId = dbSource.getMaxId(sourceTableName);
while(true){
startId1 = readPointer(pointerTableName, sourceTableName, targetTableName, minId);
if(startId1 > maxId){
logger.info("finish program! HAHAHA");
System.exit(0);
}
List<GatherProjectsModel> gpmList1 = gatherDao.selectGPMList(sourceTableName, startId1, batchSize, maxId);
if(gpmList1.size() == 0){
try {
logger.info("no increase item to be handled! handle update items");
List<GatherProjectsModel> updateProjectList = gatherDao.selectUpdateProjects(sourceTableName);
for(GatherProjectsModel model:updateProjectList){
mergeProjectNew.handleNewProject(model, true);
gatherDao.updateUpdateMark(sourceTableName, null, model.getId());
}
logger.info("all update items have been solved. Sleep 1h");
Thread.sleep(3600*1000);
continue;
} catch (InterruptedException e) {
e.printStackTrace();
}
}
for(GatherProjectsModel model1:gpmList1){
logger.info("handling project : " + model1.getId());
mergeProjectNew.handleNewProject(model1, false);
//如果是对更新表中的数据进行操作 则不更新指针
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, model1.getId() + 1);
gatherDao.updateUpdateMark(sourceTableName, null, model1.getId()); //需要更新update_mark字段 因为按顺序取的时候可能取到1的
}
}
}
public static void main(String[] args){
ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
MergeProjects Main = applicationContext.getBean(MergeProjects.class);
Main.start();
}
}
package com.ossean;
import java.util.List;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import com.ossean.databaseSource.DBSource;
import com.ossean.databaseSource.GatherDao;
import com.ossean.databaseSource.UpdateControlProjectsDao;
import com.ossean.model.GatherProjectsModel;
import com.ossean.util.MergeProjectNew;
import com.ossean.util.MergeProjectsUtil;
@Component
public class MergeProjects {
Logger logger = Logger.getLogger(this.getClass());
@Resource
private DBSource dbSource;
@Resource
private GatherDao gatherDao;
@Resource
private UpdateControlProjectsDao updateControlDao;
@Qualifier("mergeProjectNew")
@Autowired
private MergeProjectNew mergeProjectNew;
private int startId1;
private static String pointerTableName = "edd_pointers";
private static String sourceTableName = "gather_projects";
private static String targetTableName = "edd_relations";
private static int batchSize = 500;
//read the batch pointer
public int readPointer(String table, String source, String target, int minId){
int pointer = minId;
try {
pointer = dbSource.getPointer(table, source, target);
} catch(Exception e) {
logger.info("No such pointer! Create one");
dbSource.insertPointer(table, source, target, pointer);
}
return pointer;
}
public void start(){
//int minId = 1;
//int maxId = dbSource.getMaxId(sourceTableName);
int count=0;
count = readPointer(pointerTableName, sourceTableName, targetTableName, count);
while(true){
//startId1 = readPointer(pointerTableName, sourceTableName, targetTableName, minId);
// if(startId1 > maxId){
//
// try {
// logger.info("没有数据了休眠10min");
// Thread.sleep(600*1000L);
// continue;
// } catch (InterruptedException e) {
// e.printStackTrace();
// }
// }
List<GatherProjectsModel> gpmList1 = gatherDao.selectGPMList(sourceTableName,batchSize);
if(gpmList1.size() == 0){
try {
logger.info("no increase item to be handled! handle update items");
List<GatherProjectsModel> updateProjectList = gatherDao.selectUpdateProjects(sourceTableName,batchSize);
if(updateProjectList.size() == 0){
logger.info("all update items have been solved. Sleep 10 mins");
Thread.sleep(600*1000);
continue;
}
else
for(GatherProjectsModel model:updateProjectList){
mergeProjectNew.handleNewProject(model, true);
gatherDao.updateUpdateMark(sourceTableName, 1, model.getId());
}
} catch (InterruptedException e) {
e.printStackTrace();
}
}
long start = System.currentTimeMillis();
for(GatherProjectsModel model1:gpmList1){
logger.info("handling project : " + model1.getId());
mergeProjectNew.handleNewProject(model1, false);
count = count+1;
dbSource.updatePointer(pointerTableName, sourceTableName, targetTableName, count);
gatherDao.updateUpdateMark(sourceTableName, 1, model1.getId()); // the update_mark field needs updating, because sequential reads may otherwise pick the row up again
}
long end = System.currentTimeMillis();
logger.warn("deal with: 500 projects cost: "+(float)(end - start)/60000+" minutes");
}
}
public static void main(String[] args){
ApplicationContext applicationContext = new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
MergeProjects Main = applicationContext.getBean(MergeProjects.class);
Main.start();
}
}

View File

@ -73,7 +73,7 @@ public class MergeProjectsOld {
}else{}
while(true){
startId1 = readPointer(pointerTableName, sourceTableName, edd_relations_pointer, minId);
List<GatherProjectsModel> gpmList1 = gatherDao.selectGPMList(sourceTableName, startId1, batchSize, maxId);
List<GatherProjectsModel> gpmList1 = gatherDao.selectGPMList(sourceTableName, batchSize);
if(gpmList1.size() == 0){
try {
logger.info("no increase item to be handled! handle update items");
@ -92,7 +92,7 @@ public class MergeProjectsOld {
List<GatherProjectsModel> updateProjectList = gatherDao.selectUpdateProjects(sourceTableName);
for(GatherProjectsModel model:updateProjectList){
mergeProjectNew.handleNewProject(model, true);
gatherDao.updateUpdateMark(sourceTableName, null, model.getId());
gatherDao.updateUpdateMark(sourceTableName, 1, model.getId());
}
logger.info("all update items have been solved. Sleep 1h");
Thread.sleep(3600*1000);
@ -110,7 +110,7 @@ public class MergeProjectsOld {
//do not advance the pointer when processing rows from the update set
dbSource.updatePointer(pointerTableName, sourceTableName, edd_relations_pointer, model1.getId() + 1);
gatherDao.updateUpdateMark(sourceTableName, null, model1.getId()); // the update_mark field needs updating, because sequential reads may otherwise pick the row up again
gatherDao.updateUpdateMark(sourceTableName, 1, model1.getId()); // the update_mark field needs updating, because sequential reads may otherwise pick the row up again
}
}
}

View File

@ -26,13 +26,13 @@ public interface DBDest {
*/
//insert a row into open_source_projects
@Insert("insert into ${table} (id,name,description,"
+ "url,url_md5,language,category,"
+ "url,language,category,"
+ "source,created_time,updated_time,"
+ "tags,tags_for_search,synonyms,license,homepage,extracted_time,update_mark) values (#{model.id},#{model.name},#{model.description},"
+ "#{model.url},#{model.url_md5},#{model.language},"
+ "tags,tags_for_search,synonyms,update_mark) values (#{model.id},#{model.name},#{model.description},"
+ "#{model.url},,#{model.language},"
+ "#{model.category},#{model.source},"
+ "#{model.created_time},#{model.updated_time},#{model.tags},"
+ "#{model.tags_for_search},#{model.synonyms},#{model.license},#{model.homepage},#{model.extracted_time},#{model.update_mark})")
+ "#{model.tags_for_search},#{model.synonyms},#{model.update_mark})")
public void insertOsp(@Param("table") String table, @Param("model") OpenSourceProject model);
//look up the id of the row just inserted into open_source_projects

View File

@ -87,8 +87,10 @@ public interface GatherDao {
//fetch a list of GatherProjectsModel by id range and limit
@Select("select * from ${table} where id>=#{start} and id<=#{maxId} order by id asc limit #{size}")
public List<GatherProjectsModel> selectGPMList(@Param("table") String table, @Param("start") int start, @Param("size") int size, @Param("maxId") int maxId);
//@Select("select * from ${table} where id>=#{start} and id<=#{maxId} order by id asc limit #{size}")
//public List<GatherProjectsModel> selectGPMList(@Param("table") String table, @Param("start") int start, @Param("size") int size, @Param("maxId") int maxId);
@Select("select * from ${table} where update_mark = 0 limit #{size}")
public List<GatherProjectsModel> selectGPMList(@Param("table") String table, @Param("size") int size);
@Select("select * from ${table} where id>=#{start} and id<#{end} limit #{size}")
public List<GatherProjectsModel> selectGPMListBySize(@Param("table") String table, @Param("start") int start, @Param("end") int end, @Param("size") int size);
@ -106,12 +108,12 @@ public interface GatherDao {
public List<GatherProjectsModel> selectGPMBySameName(@Param("table") String table, @Param("name") String name, @Param("id") int id);
//fetch all updated rows whose update_mark is 1
@Select("select * from ${table} where update_mark=1")
public List<GatherProjectsModel> selectUpdateProjects(@Param("table") String table);
//fetch all updated rows whose update_mark is 2
@Select("select * from ${table} where update_mark=2 limit #{size}")
public List<GatherProjectsModel> selectUpdateProjects(@Param("table") String table,@Param("size") int size);
//update the update_mark field
@Select("update ${table} set update_mark=#{update_mark} where id=#{id}")
public void updateUpdateMark(@Param("table") String table, @Param("update_mark") String update_mark, @Param("id") int id);
public void updateUpdateMark(@Param("table") String table, @Param("update_mark") int update_mark, @Param("id") int id);
}
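Editor's note: read together, these mappers give gather_projects a small update_mark lifecycle as the new MergeProjects loop uses it: 0 selects rows not yet merged, 2 selects rows whose source data changed after a merge, and updateUpdateMark stamps a row with 1 once it has been handled. A hedged sketch of one pass over that lifecycle; runOnePass is an illustrative name, and the DAO and merge handler are assumed to be the Spring-managed beans shown elsewhere in this commit.

import java.util.List;
import com.ossean.databaseSource.GatherDao;
import com.ossean.model.GatherProjectsModel;
import com.ossean.util.MergeProjectNew;

// One pass of the update_mark lifecycle: 0 = never merged, 2 = merged but updated at the source, 1 = handled.
public class UpdateMarkPassSketch {
    static void runOnePass(GatherDao gatherDao, MergeProjectNew merger, int batchSize) {
        List<GatherProjectsModel> fresh = gatherDao.selectGPMList("gather_projects", batchSize); // update_mark = 0
        for (GatherProjectsModel p : fresh) {
            merger.handleNewProject(p, false);                         // merge as a new project
            gatherDao.updateUpdateMark("gather_projects", 1, p.getId()); // mark handled
        }
        List<GatherProjectsModel> updated = gatherDao.selectUpdateProjects("gather_projects", batchSize); // update_mark = 2
        for (GatherProjectsModel p : updated) {
            merger.handleNewProject(p, true);                          // true: treat as an update to an existing project
            gatherDao.updateUpdateMark("gather_projects", 1, p.getId());
        }
    }
}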

View File

@ -45,7 +45,7 @@ public class StringHandler {
}
}
resultStrArr = tmp.toArray(new String[0]);
System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]);
//System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]);
return resultStrArr;
}

View File

@ -23,6 +23,16 @@
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<appender name="file_log1" class="org.apache.log4j.DailyRollingFileAppender">
<param name="File" value="./log/time_cost.txt" />
<param name="threshold" value="WARN" />
<layout class="org.apache.log4j.PatternLayout">
<param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p %c(%F:%L) ## %m%n" />
</layout>
</appender>
<!-- 邮件只有ERROR时才会发送 -->
@ -47,13 +57,13 @@
<level value="warn" />
<appender-ref ref="stdout" />
</logger>
<root>
<level value="info" />
<appender-ref ref="stdout" />
<appender-ref ref="file" />
<appender-ref ref="file_log" />
<appender-ref ref="file_log1" />
</root>
</log4j:configuration>

View File

@ -1,126 +0,0 @@
package foo;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.ApplicationContext;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import com.ossean.databaseSource.DBSource;
import com.ossean.databaseSource.GatherDao;
import com.ossean.model.GatherProjectsModel;
import com.ossean.util.MergeProjectNew;
import com.ossean.util.MergeProjectsUtil;
import com.ossean.util.RegexHandler;
import com.ossean.util.StringHandler_ProjectName;
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = {"classpath*:/applicationContext*.xml"})
public class Testcase {
@Autowired
ApplicationContext ctx;
Logger logger = Logger.getLogger(this.getClass());
@Resource
private DBSource dbSource;
@Resource
private GatherDao gatherDao;
@Qualifier("mergeProjectsUtil")
@Autowired
private MergeProjectsUtil mergeProjectsUtil;
@Qualifier("mergeProjectNew")
@Autowired
private MergeProjectNew mergeProjectNew;
public static void main(String[] args) {
}
@Test
public void testGetSynonyms(){
List<GatherProjectsModel> prj_list = gatherDao.selectGPMList("gather_projects_test", 11000, 100, 1135058);
GatherProjectsModel gpm = gatherDao.selectGPMById("gather_projects", 11085);//1220,1995,1194,11085
GatherProjectsModel gpm2 = gatherDao.selectGPMById("gather_projects", 237094);
List<String> list= mergeProjectNew.getSynonyms(gpm);
String str = "";
for (int j = 0;j<list.size();j++){
str =str+list.get(j)+",";
}
System.out.println("prj id:"+gpm.getId()+"; prj name:"+gpm.getName()+"; synonyms:"+str);
// for(int i=0;i<prj_list.size();i++){
// List<String> list= mergeProjectNew.getSynonyms(prj_list.get(i));
//
// String str = "";
// for (int j = 0;j<list.size();j++){
// str =str+list.get(j)+",";
// }
// System.out.println("prj id:"+prj_list.get(i).getId()+"; prj name:"+prj_list.get(i).getName()+"; synonyms:"+str);
// }
// String a = "abc-hu";
// System.out.println(RegexHandler.extractEngDecChiAndDot(a));
}
@Test
public void testCalSimilarityForTag(){
GatherProjectsModel gpm_1 = gatherDao.selectGPMById("gather_projects", 87361);
GatherProjectsModel gpm_2 = gatherDao.selectGPMById("gather_projects", 64);
//GatherProjectsModel gpm_3 = gatherDao.selectGPMById("gather_projects", 99700);87361
// 770684
// 907638
// 1032900
// 1032911
//boolean isSame = mergeProjectNew.isTheSameAfterTFIDF(gpm_1, gpm_2, false);
// double similarity = mergeProjectNew.calSimilarityByDescription(gpm_1, gpm_2);
// System.out.println("calSimilarityByDesc"+String.valueOf(similarity));
// double similarity2 = mergeProjectNew.calSimilarityByTag(gpm_1, gpm_2);
// System.out.println("calSimilarityByTag"+String.valueOf(similarity2));
mergeProjectNew.handleNewProject(gpm_1, false);
//System.out.println("the two model is same: "+isSame);
// List<String> synonymList = mergeProjectsUtil.getSynonyms(gpm_2);
// for (int i =0;i<synonymList.size();i++)
// System.out.println(synonymList.get(i));
}
// 标签分离函数
public static List<String> tagsSegmentation(String tags) {
List<String> tag = new ArrayList<String>();
if (tags != null) {
String regex = "<[^<>]*>";
//一个Pattern是一个正则表达式经编译后的表现模式
Pattern pattern = Pattern.compile(regex);
//一个Matcher对象是一个状态机器它依据Pattern对象做为匹配模式对字符串展开匹配检查
Matcher matcher = pattern.matcher(tags);
while (matcher.find()) {
String t = matcher.group();
t = t.substring(1, t.length() - 1);
tag.add(t);
}
}
return tag;
}
}

View File

@ -1,5 +1,10 @@
source 'http://ruby.taobao.org/'
<<<<<<< HEAD
#source 'http://ruby.taobao.org/'
source 'https://rubygems.org'
=======
source 'https://ruby.taobao.org/'
#source 'https://rubygems.org'
>>>>>>> tmp
unless RUBY_PLATFORM =~ /w32/
# unix-like only

View File

@ -60,11 +60,11 @@ class OpenSourceProjectsController < ApplicationController
#@open_source_project.update_column(:view_num_ossean, (@open_source_project.view_num_ossean.to_i+1))
#@open_source_project.save
@pops=@open_source_project.open_source_project_popularities.where("year_col >= 2013").order("year_col ASC, month_col ASC")
@hot_words=@open_source_project.hot_words
# @pops=@open_source_project.open_source_project_popularities.where("year_col >= 2013").order("year_col ASC, month_col ASC")
# @hot_words=@open_source_project.hot_words
@memo = RelativeMemo.new(:open_source_project => @open_source_project)
@topics_count = @open_source_project.relative_memos_num
@bugs = @open_source_project.relative_bugs.limit(6)
# @bugs = @open_source_project.relative_bugs.limit(6)
###########recommendation#############################################################
@simility_osp_array ,@simility_id_array,@simility_weight_array ,@simility_osp_des= recommend(params[:id],"simility")
@ -205,6 +205,32 @@ class OpenSourceProjectsController < ApplicationController
language = ''
language << @language
#search_words << ' ' + @language if @language
# call the starlee recommendation service
require 'net/http'
star_projects = []
parse_string = "http://172.16.128.35:5000/correlation_search?q=" + @name
# parse_string = "http://172.20.10.3:5000/correlation_search?q=" + @name
parse_string.gsub! " ","%20"
url = URI.parse(parse_string)
req = Net::HTTP::Get.new(url.to_s)
res = Net::HTTP.start(url.host, url.port) {|http|
http.request(req)
}
star_items = res.body
require 'json'
star_items = JSON.parse(star_items)["items"]
star_project_ids = star_items[0,5]
star_projects = []
# look up the project for each id
star_project_ids.each do |id|
id = id.to_i
project = OpenSourceProject.find_by_id(id)
star_projects << project
end
search = OpenSourceProject.search do
without(:filtration,0)
with(:created_time, Date.new(params[:created_time].to_i, 01, 01)..Date.new(params[:created_time].to_i+1, 01, 01)) if params[:created_time].present? && !('earlier'.eql? params[:created_time])
@ -240,12 +266,36 @@ class OpenSourceProjectsController < ApplicationController
else
order_by(:score, :desc)
end
paginate :page => params[:page], :per_page => 10 #the default paginator of solr,10 records for each page
paginate :page => params[:page], :per_page => 10 # the default paginator of solr,10 records for each page
end
per_page_option = 10
@hits = search.hits
@open_source_projects = search.results
@projects_count = search.total #get total count of search
open_source_projects_index = 0
while open_source_projects_index<@open_source_projects.length do
project = @open_source_projects[open_source_projects_index]
id = project.id
star_projects.each do |star_project|
if id.to_i == star_project.id
@open_source_projects.delete(project)
open_source_projects_index -= 1
break
end
end
open_source_projects_index += 1
end
if params[:page]==nil || params[:page]==1
star_index = star_projects.length-1
while star_index>=0 do
star_project = star_projects[star_index]
@open_source_projects.unshift(star_project)
star_index -= 1
end
end
# if @projects_count <= 10
# search_words1 = search_words.unpack("U*").select{ |p| (0x4e00..0x9fa5).member? p }.pack("U*")
# if search_words.include?('客户端')
@ -271,6 +321,8 @@ class OpenSourceProjectsController < ApplicationController
# @projects_count = search.total + search_1.total
# end
@projects_pages = Paginator.new @projects_count, per_page_option, params['page'] #custom paginator,to show the 10 records the default paginator of solr gets
respond_to do |format|
format.html { render :layout => "base" }
end
@ -408,7 +460,8 @@ class OpenSourceProjectsController < ApplicationController
@created_time = params[:created_time]
@projects_count=OpenSourceProject.count
@posts_count = RelativeMemo.count
# @posts_count = RelativeMemo.count
@posts_count = 14243800
#@open_source_projects=OpenSourceProject.find_hot_osps(20)
@projects_for_cloud = OpenSourceProject.where("filtration != 0").order("composite_score desc").limit(60)

View File

@ -1689,11 +1689,11 @@ module ApplicationHelper
# nav_list.push(stores_link) if @nav_dispaly_store_all_label
#osp_index
nav_list.push(osp_index_link) if @nav_dispaly_osp_index_label
# nav_list.push(osp_index_link) if @nav_dispaly_osp_index_label
#highchart_index
nav_list.push(highchart_index_link) if @nav_dispaly_osp_index_label
# nav_list.push(highchart_index_link) if @nav_dispaly_osp_index_label
#sta_git_match
nav_list.push(sta_git_match_link) if @nav_dispaly_osp_index_label
# nav_list.push(sta_git_match_link) if @nav_dispaly_osp_index_label
content_li = ''
nav_list.collect do |nav_item|

View File

@ -246,12 +246,12 @@ class OpenSourceProject < ActiveRecord::Base
#tag_ids = ActsAsTaggableOn::Tagging.select(:tag_id).where("taggable_id = ? AND taggable_type = ? AND disagree_num > ?", obj_id, "OpenSourceProject", 5)
# ActsAsTaggableOn::Tag.includes(ActsAsTaggableOn::Tagging).where("taggable_id = ? AND taggable_type = ? AND disagree_num > ?", obj_id, "OpenSourceProject", -1)
#tag_list = ActsAsTaggableOn::Tag.select(:name).where(id:tag_ids)
tag_list = ActsAsTaggableOn::Tag.find_by_sql ["SELECT `tags`.* FROM `tags`"+
" INNER JOIN `taggings` ON `tags`.`id` = `taggings`.`tag_id` WHERE"+
"`taggings`.`taggable_id` = #{self.id}"+
" AND `taggings`.`taggable_type` = 'OpenSourceProject' AND `taggings`.`disagree_num` > 5" +
" AND (taggings.context = 'tags' AND taggings.tagger_id IS NULL) LIMIT #{limit}"]
return tag_list
# tag_list = ActsAsTaggableOn::Tag.find_by_sql ["SELECT `tags`.* FROM `tags`"+
# " INNER JOIN `taggings` ON `tags`.`id` = `taggings`.`tag_id` WHERE"+
# "`taggings`.`taggable_id` = #{self.id}"+
# " AND `taggings`.`taggable_type` = 'OpenSourceProject' AND `taggings`.`disagree_num` > 5" +
# " AND (taggings.context = 'tags' AND taggings.tagger_id IS NULL) LIMIT #{limit}"]
# return tag_list
end
def get_tag_list_userTag(limit = 9)

View File

@ -18,7 +18,7 @@
"http://www.miibeian.gov.cn/", :target => "_blank" %></span>
</p>
<div id="logo_link">
<!-- <div id="logo_link">
<span class="footer_logo_link"><%= link_to image_tag('/images/footer_logo/nudt.png',
:size=>'100x30',:alt=>"国防科学技术大学计算机学院"),
"http://www.nudt.edu.cn/special.asp?classid=12", :target => "_blank" %></span>
@ -34,7 +34,7 @@
<span class="footer_logo_link"><%= link_to image_tag('/images/footer_logo/inforbus.png',
:size=>'100x30',:alt=>"山东中创软件商用中间件股份有限公司"),
"http://www.inforbus.com", :target => "_blank" %></span>
</div>
</div> -->
<!--gcm-->
</div>

View File

@ -75,7 +75,7 @@
</div>
<% end -%>
<%#= render_menu :top_menu if User.current.logged? || !Setting.login_required? -%>
<%= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%>
<%#= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%>
<%# the custom nav bar is configured in the base layout via variables starting with (@nav_dispaly......); a global search will find them %>
</div>
<div style="border-top:solid 1px #C6E9F1;width:940px;margin-left:auto;margin-right:auto;margin-bottom: 0px;margin-top: -10px;"></div>

View File

@ -52,7 +52,7 @@
<div style="width: 940px; margin: auto;">
<div>
<div class="welcome_logo">
<%= link_to image_tag("/images/logo.png", weight: "36px", height: "36px"), home_path %>
<!-- <%= link_to image_tag("/images/logo.png", weight: "36px", height: "36px"), home_path %> -->
</div>
<% if User.current.logged? -%>
<div id="loggedas">
@ -75,7 +75,7 @@
<% end -%>
<%#= render_menu :top_menu if User.current.logged? || !Setting.login_required? -%>
<%= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%>
<%#= render_dynamic_nav if User.current.logged? || !Setting.login_required? -%>
<%# the custom nav bar is configured in the base layout via variables starting with (@nav_dispaly......); a global search will find them %>
</div>
<div style="clear: both;"></div>

View File

@ -24,9 +24,9 @@
<p class="stats">
<!--%= link_to project_info.followers_num, open_source_project_path(project) %-->
<a>
<%= (project.relative_memos_num.nil?)?0:project.relative_memos_num.to_i %>
<!-- <%= (project.relative_memos_num.nil?)?0:project.relative_memos_num.to_i %> -->
</a>
<%= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %>
<!-- <%= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %> -->
</p>
@ -35,10 +35,10 @@
<p class="stats">
<!--%= link_to project_info.visit_num, open_source_project_path(project) %-->
<a>
<%= short_num (project.view_num_ossean.nil?)?0:project.view_num_ossean %>
<!-- <%= short_num (project.view_num_ossean.nil?)?0:project.view_num_ossean %> -->
<%#= short_num ((project.view_num.nil?)?0:project.view_num)+project.view_num_ossean+project.relative_topics.sum('view_num') %>
</a>
<%= content_tag('span', l(:label_view_num)) %>
<%#= content_tag('span', l(:label_view_num)) %>
</p>
<!--<p class="stats">-->
<!--%=link_to project.code_line %-->

View File

@ -17,16 +17,16 @@
</div>
<div class="wiki-description">
<%#= textilizable(project.short_description, :project => project) %>
<%= show_project_descriptions project.result.description, 300 %>
<%= show_project_descriptions project.description, 300 %>
</div>
<div class="information">
<p class="stats">
<!--%= link_to project_info.followers_num, open_source_project_path(project) %-->
<%#= link_to project_info.followers_num, open_source_project_path(project) %>
<a>
<%= (project.result.relative_memos_num.nil?)?0:project.result.relative_memos_num.to_i %>
<%#= (project.result.relative_memos_num.nil?)?0:project.result.relative_memos_num.to_i %>
</a>
<%= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %>
<%#= content_tag('span', l(:label_x_relative_topics_count, :count => 0)) %>
</p>
<!--
@ -42,10 +42,10 @@
<p class="stats">
<!--%= link_to project_info.visit_num, open_source_project_path(project) %-->
<a>
<%= short_num((project.result.view_num_ossean.nil?)?0:project.result.view_num_ossean)%>
<%#= short_num((project.result.view_num_ossean.nil?)?0:project.result.view_num_ossean)%>
<%#= short_num ((project.view_num.nil?)?0:project.view_num)+project.view_num_ossean+project.relative_topics.sum('view_num') %>
</a>
<%= content_tag('span', l(:label_view_num)) %>
<%#= content_tag('span', l(:label_view_num)) %>
</p>
<!-- <p class="stats">
@ -78,7 +78,7 @@
<!--</div>-->
<div class="licences">
<%= content_tag('span', "#{l(:label_create_time)}: ") %>
<%= content_tag('span', project.result.created_time.to_s[0,10]) %>
<%= content_tag('span', project.created_time.to_s[0,10]) %>
</div>
<!--gcm-->
@ -99,6 +99,6 @@
<div class="tags">
<div id="tags">
<%= image_tag("/images/sidebar/tags.png") %>
<%= render :partial => 'tags/tag_name', :locals => {:obj => project.result, :object_flag => "8", :non_list_all => true} %>
<%= render :partial => 'tags/tag_name', :locals => {:obj => project, :object_flag => "8", :non_list_all => true} %>
</div>
</div>

View File

@ -86,7 +86,7 @@
<%= text_field_tag :name, nil, placeholder:'请输入要搜索的关键字', :size => 27, :class => "topics-search-text-field" %>
</div>
<%#= hidden_field_tag 'project_type', project_type %>
<%= submit_tag l(:label_search_topics), :class => "enterprise", :name => nil, :onclick => "user_trace_search_knowledge_in_search_bar('#{request.session_options[:id]}');" %>
<%#= submit_tag l(:label_search_topics), :class => "enterprise", :name => nil, :onclick => "user_trace_search_knowledge_in_search_bar('#{request.session_options[:id]}');" %>
</div>
<% end %>
<%###################search topics###########################%>

View File

@ -207,15 +207,33 @@
<div data-group="nav-topbar">
<div class="nav-topbar-content">
<ul>
<li class="nav-search-con">查找条件><span class="nav-topbar-arror"></span>
<!-- <li class="nav-search-con">查找条件><span class="nav-topbar-arror"></span>
</li>
<li>
<span class="topbar-search J_TopbarSearch">
<%= show_condition(@app_dir, @language, @created_time, params[:name]) %>
</span>
</li>
-->
<%#= show_condition(@app_dir, @language, @created_time, params[:name]) %>
<!-- </span>
</li> -->
<li class="nav-search-con">共 <span class="h"><%= @projects_count %></span> 个开源项目</li>
<li class="nav-search-con" style="margin-left: 50px">开发语言:
<%= link_to "JAVA", search_open_source_projects_path(:language => "JAVA", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "C++", search_open_source_projects_path(:language => "c++", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "C", search_open_source_projects_path(:language => "c", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "C#", search_open_source_projects_path(:language => "c#", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "ruby", search_open_source_projects_path(:language => "ruby", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
</li>
<li class="nav-search-con" style="margin-left: 50px">创立时间:
<%= link_to "2014", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2014", :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "2013", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2013", :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "2012", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2012", :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "2011", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2011", :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "2010", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2010", :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "更早", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "earlier", :name => params[:name]), :class => "nav-more J_More show" %>
</li>
<!-- <li class="related-count"><a href="/search?q=iphone&amp;app=vproduct&amp;vlist=1&amp;from_combo=true">8款相关产品</a></li> -->
</ul>
</div>
@ -237,7 +255,7 @@
<span class="arrow arrow-bottom"></span>
</a>
</div> -->
<div class="nav-category ">
<!-- <div class="nav-category ">
<h4 title="网络类型">开发语言<span></span></h4>
<%= link_to "JAVA", search_open_source_projects_path(:language => "JAVA", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "C++", search_open_source_projects_path(:language => "c++", :app_dir => @app_dir, :created_time => @created_time, :name => params[:name]), :class => "nav-more J_More show" %>
@ -248,8 +266,8 @@
<span class="arrow arrow-bottom"></span>
</a>
</div>
<div class="nav-category ">
</div> -->
<!-- <div class="nav-category ">
<h4 title="网络类型">创立时间<span></span></h4>
<%= link_to "2014", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2014", :name => params[:name]), :class => "nav-more J_More show" %>
<%= link_to "2013", search_open_source_projects_path(:app_dir => @app_dir, :language => @language, :created_time => "2013", :name => params[:name]), :class => "nav-more J_More show" %>
@ -260,7 +278,8 @@
<span class="arrow arrow-bottom"></span>
</a>
</div>
</div> -->
<p></p>
</div>
<div class="nav-category-group type-block" data-group="type-block">
<div class="nav-flexbox ">
@ -285,12 +304,12 @@
<div id="projects-index">
<ul class='projects'>
<% for i in (0..@hits.length-1) %>
<% project = @hits[i] %>
<% for i in (0..@open_source_projects.length-1) %>
<% project = @open_source_projects[i] %>
<% if !project.nil? %>
<li class='project-table'>
<div class='root'>
<%= link_to project.result.name, open_source_project_path(project.result), :class => "project root leaf", :onclick => "user_trace_click_project_in_project_list_after_search(this,#{i+1},'#{request.session_options[:id]}');" %>
<%= link_to project.name, open_source_project_path(project), :class => "project root leaf" %>
<span style="float: right;"></span>
<%= render :partial => 'open_source_projects/os_project_search_list', :locals => {:project => project, :i => i} %>
</div>
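The list above now iterates @open_source_projects directly instead of search hits, so each project must respond to name, created_time and the open_source_project_path helper without a .result wrapper. A minimal controller-side sketch of that assumption (the query and limit below are illustrative, not taken from this commit):

# Hypothetical sketch: @open_source_projects as plain OpenSourceProject records
# matching the filters used in the view above.
scope = OpenSourceProject.where("name LIKE ?", "%#{params[:name]}%")
@projects_count       = scope.count
@open_source_projects = scope.order("created_time DESC").limit(20)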

View File

@ -96,14 +96,14 @@
</ul>
</div>
<div id="relation_recommendation" style="margin-top: 20px;">
<!-- <div id="relation_recommendation" style="margin-top: 20px;">
<div class="recommondation_title">软件关联网络</div>
<div id="echarts_simility" style="width: 220px; height: 200px;">
</div>
<div id="echarts_relation" style="width: 220px; height: 200px;">
</div>
</div>
</div> -->
</div>

View File

@ -1,21 +1,21 @@
<!-- Overall situation analysis tab -->
<div class="tab-pane fade in active" id="tab2">
<h1 style="fontsize:19px;color:#E67E22;font-weight:900"><span style="color: #269AC9;font-weight:900">职位需求情况</span></h1>
<!-- <h1 style="fontsize:19px;color:#E67E22;font-weight:900"><span style="color: #269AC9;font-weight:900">职位需求情况</span></h1>
<div id="highchart_container1" style="width: 330px; height: 200px; float: left"></div>
<div id="highchart_container3" style="width: 330px; height: 200px; float: right"></div>
<div id="highchart_container3" style="width: 330px; height: 200px; float: right"></div> -->
<h1 style="fontsize:19px;color: #E67E22;font-weight:900; clear: both"><span style="color: #269AC9;font-weight:900">薪资情况</span></h1>
<h1 style="fontsize:19px;color: #E67E22;font-weight:900; clear: both"><span style="color: #269AC9;font-weight:900">工作地点分析</span></h1>
<div id="highchart_container2" style="width: 330px; height: 200px; float: left"></div>
<div id="highchart_container3" style="width: 330px; height: 200px; float: left"></div>
<div id="highchart_container4" style="width: 330px; height: 200px; float: right"></div>
<h1 style="fontsize:19px;color: #E67E22;font-weight:900; clear: both"><span style="color: #269AC9;font-weight:900">工作经验情况</span></h1>
<h1 style="fontsize:19px;color: #E67E22;font-weight:900; clear: both"><span style="color: #269AC9;font-weight:900">工作经验分析</span></h1>
<div id="highchart_container5" style="width: 330px; height: 200px; float: left"></div>
<div id="highchart_container6" style="width: 330px; height: 200px; float: right"></div>
<h1 style="fontsize:19px;color: #E67E22;font-weight:900; clear: both"><span style="color: #269AC9;font-weight:900">相关软件情况</span></h1>
<h1 style="fontsize:19px;color: #E67E22;font-weight:900; clear: both"><span style="color: #269AC9;font-weight:900">相关软件分析</span></h1>
<div id="highchart_container7" style="width: 330px; height: 300px; float: left"></div>
<div id="highchart_container8" style="width: 330px; height: 300px; float: right"></div>
@ -385,7 +385,12 @@
text: null
},
xAxis: {
categories: categories
categories: categories,
labels: {
formatter: function() {
return(this.value.length>20?this.value.substring(0,20) + "...":this.value);
}
}
},
yAxis: {
title: {
@ -465,6 +470,7 @@
'<%=osp.name.to_s %>',
<% end %>
],
data = [
<% for i in 0..@chart8_osp.length-1 %>
{
@ -507,7 +513,12 @@
text: null
},
xAxis: {
categories: categories
categories: categories,
labels: {
formatter: function() {
return(this.value.length>20?this.value.substring(0,20) + "...":this.value);
}
}
},
yAxis: {
title: {

View File

@ -282,7 +282,7 @@
<%########################### Visit statistics code ##############################%>
<script type="text/javascript">
var cnzz_protocol = (("https:" == document.location.protocol) ? " https://" : " http://");
document.write(unescape("%3Cspan id='cnzz_stat_icon_1256970726'%3E%3C/span%3E%3Cscript src='" + cnzz_protocol + "s4.cnzz.com/z_stat.php%3Fid%3D1256970726' type='text/javascript'%3E%3C/script%3E"));
// var cnzz_protocol = (("https:" == document.location.protocol) ? " https://" : " http://");
// document.write(unescape("%3Cspan id='cnzz_stat_icon_1256970726'%3E%3C/span%3E%3Cscript src='" + cnzz_protocol + "s4.cnzz.com/z_stat.php%3Fid%3D1256970726' type='text/javascript'%3E%3C/script%3E"));
</script>

View File

@ -266,9 +266,9 @@
<p class="stats">
<!--%= link_to project_info.visit_num, open_source_project_path(project) %-->
<a>
<%= topic.view_num+topic.view_num_ossean %>
<!-- <%= topic.view_num+topic.view_num_ossean %> -->
</a>
<%= content_tag('span', l(:label_view_num)) %>
<!-- <%= content_tag('span', l(:label_view_num)) %> -->
</p>
<!--<p class="stats">-->
<!--%=link_to project.code_line %-->

View File

@ -7,18 +7,21 @@
// });
// })
function reload_tags(){
<% @tags = obj.reload.tag_list %>
<% #@tags = obj.reload.tag_list %>
<% @tags = obj.attributes["tags"]
if @tags != nil && @tags.length>2
@tags = @tags[1,@tags.length-2].split(">,<")
end %>
}
</script>
<!-- 1 means user type, 2 means project type, 3 means issue type, 4 means requirement -->
<% if object_flag == '8'%>
<%# @tags = obj.get_tag_list%>
<% @tags = obj.tags.limit(9)%>
<% elsif object_flag == '10' %>
<% @tags = obj.AllTags.limit(9) %>
<% else %>
<% @tags = obj.reload.tag_list %>
<% end%>
<% #@tags = obj.get_tag_list%>
<% @tags = obj.attributes["tags"]
if @tags != nil && @tags.length>2
@tags = @tags[1,@tags.length-2].split(">,<")
end
@tags = @tags[0,9]%>
<% if non_list_all and (@tags.size > 0) %>
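For reference, the parsing above assumes obj.attributes["tags"] stores tags as a single bracketed, comma-separated string; a minimal illustration with made-up values:

# Illustration only; the tag values are invented.
raw = "<ruby>,<rails>,<mysql>"                 # assumed format of obj.attributes["tags"]
if raw != nil && raw.length > 2
  tags = raw[1, raw.length - 2].split(">,<")   # drop the outer "<" and ">" and split on ">,<"
else
  tags = []
end
tags = tags[0, 9]                              # the partial displays at most nine tags
# tags => ["ruby", "rails", "mysql"]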

View File

@ -1,53 +0,0 @@
source ~/.rvm/scripts/rvm# Default setup is given for MySQL with ruby1.9. If you're running Redmine
# with MySQL and ruby1.8, replace the adapter name with `mysql`.
# Examples for PostgreSQL, SQLite3 and SQL Server can be found at the end.
# Line indentation must be 2 spaces (no tabs).
production:
adapter: mysql2
database: redmine
host: localhost
username: root
password: ""
encoding: utf8
development:
adapter: mysql2
database: redmine_development
host: 10.107.17.20
username: root
password: "1234"
encoding: utf8
# Warning: The database defined as "test" will be erased and
# re-generated from your development database when you run "rake".
# Do not set this db to the same as development or production.
test:
adapter: mysql2
database: redmine_test
host: 10.107.17.20
username: root
password: "1234"
encoding: utf8
# PostgreSQL configuration example
#production:
# adapter: postgresql
# database: redmine
# host: localhost
# username: postgres
# password: "postgres"
# SQLite3 configuration example
#production:
# adapter: sqlite3
# database: db/redmine.sqlite3
# SQL Server configuration example
#production:
# adapter: sqlserver
# database: redmine
# host: localhost
# username: jenkins
# password: jenkins

7
trustie2/config/puma.rb Normal file
View File

@ -0,0 +1,7 @@
environment "production"
bind "unix:///var/run/ossean.sock"
restart_command '/var/www/ossean/trustie2/restart_puma'
daemonize true
pidfile "/var/tmp/pids/puma.pid"
#stdout_redirect "/var/www/ossean/trustie2/log/stdout", "/var/www/ossean/trustie2/log/stderr"
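This Puma configuration daemonizes the server, binds it to a Unix socket and records a pidfile; assuming Puma is provided by the application bundle, it would typically be loaded with a command along the lines of bundle exec puma -C config/puma.rb (the actual start script is not part of this commit).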

View File

@ -0,0 +1,10 @@
class AddIndexToRelativeMemoToOpenSourceProjectsTables < ActiveRecord::Migration
def change
arr = (1..70).to_a
arr.each do |letter|
add_index "relative_memo_to_open_source_projects_#{letter}".to_sym, :osp_id, :name => "osp_id_#{letter}"
add_index "relative_memo_to_open_source_projects_#{letter}".to_sym, :memo_type, :name => "memo_type_#{letter}"
add_index "relative_memo_to_open_source_projects_#{letter}".to_sym, :created_time, :name => "created_time_#{letter}"
end
end
end
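Because the migration fans out over 70 sharded tables, a quick way to confirm the new indexes after running it is to query the connection adapter; a rough sketch (run e.g. via rails runner, with names matching the migration above):

# Sketch: report any of the expected indexes still missing on each shard table.
(1..70).each do |n|
  table    = "relative_memo_to_open_source_projects_#{n}"
  existing = ActiveRecord::Base.connection.indexes(table).map(&:name)
  missing  = ["osp_id_#{n}", "memo_type_#{n}", "created_time_#{n}"] - existing
  puts "#{table}: missing #{missing.join(', ')}" unless missing.empty?
end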

Binary file not shown (image preview omitted; new file size 2.6 KiB).