Merge branch 'master' of https://git.trustie.net/gcm3651/ossean
git filter
This commit is contained in:
commit
d7855fa22e
|
@ -6,6 +6,6 @@
|
|||
<entry key="ext_dict">ext.dic;</entry>
|
||||
|
||||
<!--用户可以在这里配置自己的扩展停止词字典-->
|
||||
<entry key="ext_stopwords">stopword_1.dic;code_keywords.dic;stopword.dic</entry>
|
||||
<entry key="ext_stopwords"></entry>
|
||||
|
||||
</properties>
|
|
@ -11,6 +11,11 @@
|
|||
http://www.springframework.org/schema/tx/spring-tx.xsd
|
||||
http://www.springframework.org/schema/context
|
||||
http://www.springframework.org/schema/context/spring-context-3.0.xsd">
|
||||
|
||||
<bean id="jdbcTemplate" class="org.springframework.jdbc.core.JdbcTemplate">
|
||||
<property name="dataSource" ref="dataSource"></property>
|
||||
</bean>
|
||||
<bean id="SpringApplicationContext" class="com.ossean.match.utils.ApplicationContextHelper"></bean>
|
||||
|
||||
<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
|
||||
<property name="dataSource" ref="dataSource" />
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
|
||||
<properties>
|
||||
<comment>TableFlow</comment>
|
||||
<entry key="startId">0</entry>
|
||||
<entry key="endId">200000</entry>
|
||||
<entry key="batchSize">5000</entry>
|
||||
<entry key="newComing">true</entry>
|
||||
</properties>
|
|
@ -50,8 +50,12 @@ public interface ProjectDao {
|
|||
@Param("endId") int endId);
|
||||
|
||||
// 批量获取项目,filtration为1表示保留,为2表示之前保留的且已处理,为0标识不保留;update_mark为null表示已处理或者无更新,不为null表示有更新
|
||||
@Select("select id,name,url,tags,synonyms from open_source_projects where id>#{startId} AND id<=#{endId} AND ((filtration = 1) or (update_mark is not null and filtration = 2)) limit #{batchSize}")
|
||||
public List<Project> getBatchPrjsIncre(@Param("startId") int startId,
|
||||
@Param("endId") int endId,@Param("batchSize") int batchSize);
|
||||
|
||||
@Select("select id,name,url,tags,synonyms from open_source_projects where (filtration = 1) or (update_mark is not null and filtration = 2) limit #{batchSize}")
|
||||
public List<Project> getBatchPrjsIncre(@Param("batchSize") int batchSize);
|
||||
public List<Project> getBatchPrjsIncreWithoutLimit(@Param("batchSize") int batchSize);
|
||||
|
||||
// 将已经匹配过的项目的update_mark和filtration设为null和2,表示已匹配
|
||||
@Update("update open_source_projects set update_mark = null,filtration = 2 where id = #{prjId}")
|
||||
|
|
|
@ -3,6 +3,7 @@ package com.ossean.match.dao;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.ibatis.annotations.Insert;
|
||||
import org.apache.ibatis.annotations.InsertProvider;
|
||||
import org.apache.ibatis.annotations.Param;
|
||||
import org.apache.ibatis.annotations.Select;
|
||||
|
||||
|
@ -17,5 +18,9 @@ public interface RelativeMemoToOpenSourceProjectDao {
|
|||
//写入匹配结果
|
||||
@Insert("insert into ${targetTableName} (osp_id,relative_memo_id,match_weight,match_time) values (#{item.osp_id},#{item.relative_memo_id},#{item.match_weight},now())")
|
||||
public void insertMatchResult(@Param("targetTableName") String targetTableName, @Param("item") RelativeMemoToOpenSourceProject item);
|
||||
|
||||
//写入匹配结果
|
||||
@InsertProvider(type = RelativeMemoToOpenSourceProjectDaoProvider.class, method = "insertAll")
|
||||
public void insertMatchResultBatch(@Param("targetTableName") String targetTableName, @Param("list") List<RelativeMemoToOpenSourceProject> list);
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
package com.ossean.match.dao;
|
||||
|
||||
import java.text.MessageFormat;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.ossean.match.model.RelativeMemoToOpenSourceProject;
|
||||
|
||||
/**
 * MyBatis SQL provider that builds a single multi-row INSERT statement for
 * match results. It is referenced by name ("insertAll") from an
 * {@code @InsertProvider} annotation, so the method name must not change.
 */
public class RelativeMemoToOpenSourceProjectDaoProvider {

    /** Column list shared by every generated statement. */
    private static final String COLUMNS = "(osp_id,relative_memo_id,match_weight,match_time) ";

    /**
     * Builds an INSERT with one value tuple per element of the "list" parameter.
     * Each tuple refers to its row through MyBatis placeholders of the form
     * {@code #{list[i].property}}; the match time is filled in by the database
     * via {@code now()}.
     *
     * @param map parameter map holding the target table name under
     *            "targetTableName" and the rows to insert under "list"
     * @return the SQL text of the multi-row INSERT
     * @throws IllegalArgumentException if "list" is missing or empty, because
     *             an INSERT without any value tuple is not valid SQL
     */
    public String insertAll(Map<String, ?> map) {
        // Only the number of rows is needed here; MyBatis resolves the
        // properties of each element when it binds the placeholders.
        List<?> list = (List<?>) map.get("list");
        String tableName = (String) map.get("targetTableName");
        if (list == null || list.isEmpty()) {
            throw new IllegalArgumentException("insertAll requires a non-empty 'list' parameter");
        }

        StringBuilder sb = new StringBuilder();
        // NOTE(review): the table name is concatenated because identifiers cannot
        // be bound as parameters; it must only ever come from trusted code.
        sb.append("INSERT INTO ").append(tableName).append(' ');
        sb.append(COLUMNS);
        sb.append("VALUES ");
        for (int i = 0; i < list.size(); i++) {
            if (i > 0) {
                sb.append(',');
            }
            // The index is appended as a plain int. Formatting it through
            // MessageFormat applies locale grouping ("1,000") and would
            // corrupt the placeholder for batches of 1000 rows or more.
            sb.append("(#{list[").append(i).append("].osp_id},")
              .append("#{list[").append(i).append("].relative_memo_id},")
              .append("#{list[").append(i).append("].match_weight},now())");
        }
        return sb.toString();
    }

}
|
|
@ -55,11 +55,9 @@ public class Main {
|
|||
newPrjMonitor.run();
|
||||
prjToMemoMatch.run();
|
||||
memoToPrjMatch.run();
|
||||
if (matchIncrement.getSleepTime() > 0) {
|
||||
logger.info(".........sleeping.........." + matchIncrement.getSleepTime()/1000 + "s......");
|
||||
}
|
||||
Thread.sleep(matchIncrement.getSleepTime());
|
||||
|
||||
logger.info(".........sleeping.........." + 1000*60 + "s......");
|
||||
Thread.sleep(1000*60);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,10 @@ package com.ossean.match.matchprocess;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.MessageFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -12,6 +16,10 @@ import org.apache.lucene.store.Directory;
|
|||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
import org.springframework.context.support.ClassPathXmlApplicationContext;
|
||||
import org.springframework.dao.DataAccessException;
|
||||
import org.springframework.jdbc.core.JdbcTemplate;
|
||||
import org.springframework.stereotype.Component;
|
||||
import org.springframework.transaction.annotation.Propagation;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
@ -23,6 +31,7 @@ import com.ossean.match.lucene.LuceneSearch;
|
|||
import com.ossean.match.model.Project;
|
||||
import com.ossean.match.model.RelativeMemo;
|
||||
import com.ossean.match.model.RelativeMemoToOpenSourceProject;
|
||||
import com.ossean.match.utils.ApplicationContextHelper;
|
||||
import com.ossean.match.utils.Normalizer;
|
||||
|
||||
import javax.annotation.Resource;
|
||||
|
@ -75,10 +84,12 @@ public class Match {
|
|||
// 项目名匹配帖子标题
|
||||
matchMap = LuceneSearch.prjToMemoMatchByLucene(prjName,
|
||||
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader);
|
||||
//System.out.println("项目名匹配帖子标题"+matchMap.size());
|
||||
// 项目名匹配帖子标签
|
||||
matchMap = LuceneSearch.searchMemoTags(
|
||||
prjName, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight,
|
||||
matchMap, memoIndexReader);
|
||||
//System.out.println("项目名匹配帖子标签"+matchMap.size());
|
||||
}
|
||||
if (project.getSynonyms() != null) {
|
||||
synonyms = project.getSynonyms().toLowerCase();
|
||||
|
@ -89,12 +100,14 @@ public class Match {
|
|||
matchMap = LuceneSearch.prjToMemoMatchByLucene(synonym,
|
||||
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader);
|
||||
}
|
||||
//System.out.println("项目别名匹配帖子标题 "+matchMap.size());
|
||||
//项目别名匹配帖子标签 TODO
|
||||
for (String synonym : synonymsList) {
|
||||
matchMap = LuceneSearch.searchMemoTags(
|
||||
synonym, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight,
|
||||
matchMap, memoIndexReader);
|
||||
}
|
||||
//System.out.println("项目别名匹配帖子标签 "+matchMap.size());
|
||||
}
|
||||
}
|
||||
if (prjTags != null && prjTags.length() > 0 && matchMap.size() > 0) {
|
||||
|
@ -105,12 +118,14 @@ public class Match {
|
|||
prjTag, LuceneIndex.titleFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTitleWeight,
|
||||
matchMap, memoIndexReader);
|
||||
}
|
||||
//System.out.println("用项目标签搜索帖子标题 "+matchMap.size());
|
||||
// 用项目标签搜索帖子标签
|
||||
for (String prjTag : prjTagsList) {
|
||||
matchMap = LuceneSearch.searchByPrjTag(
|
||||
prjTag, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTagsWeight,
|
||||
matchMap, memoIndexReader);
|
||||
}
|
||||
//System.out.println("用项目标签搜索帖子标签 "+matchMap.size());
|
||||
}
|
||||
if (matchMap.size() > 0)
|
||||
insertPrjToMemoMatchResult(prjId, matchMap);
|
||||
|
@ -132,6 +147,8 @@ public class Match {
|
|||
*/
|
||||
@Transactional(propagation=Propagation.REQUIRES_NEW)
|
||||
public void insertPrjToMemoMatchResult(int prjId, HashMap<Integer, Double> matchMap) {
|
||||
//long start = System.currentTimeMillis();
|
||||
List<RelativeMemoToOpenSourceProject> list = new ArrayList<RelativeMemoToOpenSourceProject>();
|
||||
for (Map.Entry<Integer, Double> entry : matchMap.entrySet()) {
|
||||
try {
|
||||
int memoId = entry.getKey();
|
||||
|
@ -141,12 +158,38 @@ public class Match {
|
|||
matchResult.setOsp_id(prjId);
|
||||
matchResult.setRelative_memo_id(memoId);
|
||||
matchResult.setMatch_weight(weight);
|
||||
matchResultDao.insertMatchResult(getTargetTable(prjId), matchResult);
|
||||
list.add(matchResult);
|
||||
|
||||
//matchResultDao.insertMatchResult(getTargetTable(prjId), matchResult);
|
||||
}
|
||||
if(list.size()==800 || list.size()==matchMap.size()){
|
||||
batchInsertJDBC(list,getTargetTable(prjId));
|
||||
//matchResultDao.insertMatchResultBatch(getTargetTable(prjId), list);
|
||||
list.clear();
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("insertPrjToMemoMatchResult error: " + e);
|
||||
}
|
||||
}
|
||||
//System.out.println(prjId+" current insert time cost:"+(System.currentTimeMillis()-start)/1000+" seconds");
|
||||
}
|
||||
|
||||
@Transactional()
|
||||
public void batchInsertJDBC(List<RelativeMemoToOpenSourceProject> list,String tableName) throws DataAccessException {
|
||||
|
||||
JdbcTemplate jdbcTemplate=(JdbcTemplate)ApplicationContextHelper.getBean("jdbcTemplate");
|
||||
StringBuilder sqlbuf = new StringBuilder()
|
||||
.append("insert into "+tableName+" (osp_id,relative_memo_id,match_weight,match_time) values ");
|
||||
for (RelativeMemoToOpenSourceProject prj :list) {
|
||||
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//设置日期格式
|
||||
sqlbuf.append("("+prj.getOsp_id()+","+prj.getRelative_memo_id()+","
|
||||
+prj.getMatch_weight()+",'"+df.format(new Date())+"')");
|
||||
sqlbuf.append(",");
|
||||
}
|
||||
String sql = sqlbuf.toString();
|
||||
sql = sql.substring(0, sql.length()-1);
|
||||
jdbcTemplate.update(sql);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -163,39 +206,8 @@ public class Match {
|
|||
int a = 1 + ospId/11000;
|
||||
targetTableName = "relative_memo_to_open_source_projects_" + a;
|
||||
}
|
||||
// if (osp_id < 500) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_1";
|
||||
// }
|
||||
// else if (osp_id >= 500 && osp_id < 1000) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_2";
|
||||
// }
|
||||
// else if (osp_id >= 1000 && osp_id < 1500) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_3";
|
||||
// }
|
||||
// else if (osp_id >= 1500 && osp_id < 2000) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_4";
|
||||
// }
|
||||
// else if (osp_id >= 2000 && osp_id < 3000) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_5";
|
||||
// }
|
||||
// else if (osp_id >= 3000 && osp_id < 5000) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_6";
|
||||
// }
|
||||
// else if (osp_id >= 5000 && osp_id < 7500) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_7";
|
||||
// }
|
||||
// else if (osp_id >= 7500 && osp_id < 10000) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_8";
|
||||
// }
|
||||
// else if (osp_id >= 10000 && osp_id < 310000) {
|
||||
// int a = 7 + osp_id/5000;
|
||||
// targetTableName = "relative_memo_to_open_source_projects_" + a;
|
||||
// }
|
||||
// else if (osp_id >= 310000) {
|
||||
// targetTableName = "relative_memo_to_open_source_projects_70";
|
||||
// }
|
||||
|
||||
return targetTableName;
|
||||
//return "relative_memo_to_open_source_projects";
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -229,9 +241,9 @@ public class Match {
|
|||
memoTags = memo.getTags().toLowerCase();
|
||||
}
|
||||
memoTitle = memo.getTitle().toLowerCase();
|
||||
if (memoTitle.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
// if (memoTitle.length() == 0) {
|
||||
// continue;
|
||||
// }
|
||||
if (Normalizer.isAllNumber(memoTitle)) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -13,56 +13,16 @@ public class MatchIncrement extends Thread implements Runnable {
|
|||
private Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private int memoHistory; // the record of memo after matching last time
|
||||
private static File record; // record file "record.txt"
|
||||
private boolean newPrjComing; //新项目进入标识
|
||||
private boolean newMemoComing; //新帖子进入标识
|
||||
private boolean currentMatchDone = true; //当前批量匹配是否结束
|
||||
private boolean newPrjComing; // 新项目进入标识
|
||||
private boolean newMemoComing; // 新帖子进入标识
|
||||
private boolean currentMatchDone = true; // 当前批量匹配是否结束
|
||||
private int sleepTime = 0;
|
||||
|
||||
public boolean isNewPrjComing() {
|
||||
return newPrjComing;
|
||||
}
|
||||
|
||||
public void setNewPrjComing(boolean newPrjComing) {
|
||||
this.newPrjComing = newPrjComing;
|
||||
}
|
||||
|
||||
public boolean isNewMemoComing() {
|
||||
return newMemoComing;
|
||||
}
|
||||
|
||||
public void setNewMemoComing(boolean newMemoComing) {
|
||||
this.newMemoComing = newMemoComing;
|
||||
}
|
||||
|
||||
public int getSleepTime() {
|
||||
return sleepTime;
|
||||
}
|
||||
|
||||
public void setSleepTime(int sleepTime) {
|
||||
this.sleepTime = sleepTime;
|
||||
}
|
||||
|
||||
public MatchIncrement() {
|
||||
record = new File("record.txt");
|
||||
getMatchRecord();
|
||||
}
|
||||
|
||||
public int getMemoHistory() {
|
||||
return memoHistory;
|
||||
}
|
||||
|
||||
public synchronized void setMemoHistory(int memoHistory) {
|
||||
this.memoHistory = memoHistory;
|
||||
}
|
||||
|
||||
public boolean isCurrentMatchDone() {
|
||||
return currentMatchDone;
|
||||
}
|
||||
|
||||
public synchronized void setCurrentMatchDone(boolean currentMatchDone) {
|
||||
this.currentMatchDone = currentMatchDone;
|
||||
}
|
||||
|
||||
/**
|
||||
* read the record after matching last time from record file "record.txt",
|
||||
* the format of the record in the file: "project's record" "memo's record"
|
||||
|
@ -91,4 +51,45 @@ public class MatchIncrement extends Thread implements Runnable {
|
|||
logger.error("writeIntoRecord FileNotFoundException: " + e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isNewPrjComing() {
|
||||
return newPrjComing;
|
||||
}
|
||||
|
||||
public void setNewPrjComing(boolean newPrjComing) {
|
||||
this.newPrjComing = newPrjComing;
|
||||
}
|
||||
|
||||
public boolean isNewMemoComing() {
|
||||
return newMemoComing;
|
||||
}
|
||||
|
||||
public void setNewMemoComing(boolean newMemoComing) {
|
||||
this.newMemoComing = newMemoComing;
|
||||
}
|
||||
|
||||
public int getSleepTime() {
|
||||
return sleepTime;
|
||||
}
|
||||
|
||||
public void setSleepTime(int sleepTime) {
|
||||
this.sleepTime = sleepTime;
|
||||
}
|
||||
|
||||
public int getMemoHistory() {
|
||||
return memoHistory;
|
||||
}
|
||||
|
||||
public synchronized void setMemoHistory(int memoHistory) {
|
||||
this.memoHistory = memoHistory;
|
||||
}
|
||||
|
||||
public boolean isCurrentMatchDone() {
|
||||
return currentMatchDone;
|
||||
}
|
||||
|
||||
public synchronized void setCurrentMatchDone(boolean currentMatchDone) {
|
||||
this.currentMatchDone = currentMatchDone;
|
||||
}
|
||||
|
||||
}
|
|
@ -14,6 +14,7 @@ import org.springframework.stereotype.Component;
|
|||
|
||||
import com.ossean.match.dao.ProjectDao;
|
||||
import com.ossean.match.lucene.LuceneIndex;
|
||||
import com.ossean.match.model.Configure;
|
||||
import com.ossean.match.model.Project;
|
||||
|
||||
@Component("prjtomemomatch")
|
||||
|
@ -26,16 +27,25 @@ public class PrjToMemoMatch implements Runnable {
|
|||
@Qualifier("match")
|
||||
@Autowired
|
||||
private Match match;
|
||||
|
||||
private Configure conf;
|
||||
private String configureName = "configure";
|
||||
|
||||
public void setMatchIncrement(MatchIncrement matchIncrement) {
|
||||
this.matchIncrement = matchIncrement;
|
||||
}
|
||||
|
||||
public void prjToMemoIncrement() throws IOException, SQLException {
|
||||
public void prjToMemoIncrement(boolean isnew) throws IOException, SQLException {
|
||||
matchIncrement.setCurrentMatchDone(false);
|
||||
matchIncrement.getMatchRecord();
|
||||
|
||||
conf = new Configure(configureName);
|
||||
List<Project> newPrjs;
|
||||
// 批量读取filtration为1或者filtration为2且update_mark不为空的项目。即未匹配的和已匹配但有更新的。
|
||||
List<Project> newPrjs = projectDao.getBatchPrjsIncre(step);
|
||||
if(isnew)
|
||||
newPrjs = projectDao.getBatchPrjsIncreWithoutLimit(step);
|
||||
else
|
||||
newPrjs = projectDao.getBatchPrjsIncre(conf.getStartId(),conf.getEndId(),conf.getBatchSize());
|
||||
while (newPrjs != null && newPrjs.size() > 0) {
|
||||
if (newPrjs != null) {
|
||||
match.prjToMemoMatch(newPrjs);
|
||||
|
@ -44,16 +54,33 @@ public class PrjToMemoMatch implements Runnable {
|
|||
luceneIndex.prjIndexIncrement(newPrjs);
|
||||
}
|
||||
// getting projects' id and prjName in batches
|
||||
newPrjs = projectDao.getBatchPrjsIncre(step);
|
||||
if(isnew)
|
||||
newPrjs = projectDao.getBatchPrjsIncreWithoutLimit(step);
|
||||
else
|
||||
newPrjs = projectDao.getBatchPrjsIncre(conf.getStartId(),conf.getEndId(),conf.getBatchSize());
|
||||
}
|
||||
matchIncrement.setCurrentMatchDone(true);
|
||||
}
|
||||
|
||||
public void run() {
|
||||
if (matchIncrement.isNewPrjComing()) {
|
||||
logger.info("start osps_to_relative_memos match!!!");
|
||||
conf = new Configure(configureName);
|
||||
boolean newcoming = conf.isNewComing();
|
||||
if (newcoming) {//对于正常匹配程序,处理新数据流
|
||||
if(matchIncrement.isNewPrjComing()){
|
||||
logger.info("start osps_to_relative_memos match for new prj");
|
||||
try {
|
||||
prjToMemoIncrement(true);
|
||||
} catch (IOException e) {
|
||||
logger.error("IOException: " + e);
|
||||
} catch (SQLException e) {
|
||||
logger.error("SQLException: " + e);
|
||||
}
|
||||
}
|
||||
}
|
||||
else{
|
||||
logger.info("start osps_to_relative_memos match with multi thread");
|
||||
try {
|
||||
prjToMemoIncrement();
|
||||
prjToMemoIncrement(false);
|
||||
} catch (IOException e) {
|
||||
logger.error("IOException: " + e);
|
||||
} catch (SQLException e) {
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
package com.ossean.match.model;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.InvalidPropertiesFormatException;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
public class Configure {
|
||||
|
||||
private Properties prop = new Properties();
|
||||
Logger log4j = Logger.getLogger(Configure.class);
|
||||
private int startId;
|
||||
private int endId;
|
||||
private int batchSize;
|
||||
private boolean newComing;
|
||||
|
||||
//自定义构造函数
|
||||
public Configure(String confPath) {
|
||||
confPath = confPath + ".xml";
|
||||
try {
|
||||
URL url = ClassLoader.getSystemResource(confPath);
|
||||
InputStream is = url.openStream();
|
||||
prop.loadFromXML(is);
|
||||
} catch (FileNotFoundException e) {
|
||||
log4j.error("Configure FileNotFoundException !");
|
||||
e.printStackTrace();
|
||||
} catch (InvalidPropertiesFormatException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
config();
|
||||
}
|
||||
|
||||
private void config() {
|
||||
this.startId = Integer.parseInt(prop.getProperty("startId"));
|
||||
this.endId = Integer.parseInt(prop.getProperty("endId"));
|
||||
this.batchSize = Integer.parseInt(prop.getProperty("batchSize"));
|
||||
this.newComing = Boolean.parseBoolean(prop.getProperty("newComing"));
|
||||
|
||||
}
|
||||
|
||||
public Properties getProp() {
|
||||
return prop;
|
||||
}
|
||||
public void setProp(Properties prop) {
|
||||
this.prop = prop;
|
||||
}
|
||||
public Logger getLog4j() {
|
||||
return log4j;
|
||||
}
|
||||
public void setLog4j(Logger log4j) {
|
||||
this.log4j = log4j;
|
||||
}
|
||||
|
||||
public int getStartId() {
|
||||
return startId;
|
||||
}
|
||||
|
||||
public void setStartId(int startId) {
|
||||
this.startId = startId;
|
||||
}
|
||||
|
||||
public int getEndId() {
|
||||
return endId;
|
||||
}
|
||||
|
||||
public void setEndId(int endId) {
|
||||
this.endId = endId;
|
||||
}
|
||||
|
||||
public int getBatchSize() {
|
||||
return batchSize;
|
||||
}
|
||||
|
||||
public void setBatchSize(int batchSize) {
|
||||
this.batchSize = batchSize;
|
||||
}
|
||||
|
||||
public boolean isNewComing() {
|
||||
return newComing;
|
||||
}
|
||||
|
||||
public void setNewComing(boolean newComing) {
|
||||
this.newComing = newComing;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package com.ossean.match.utils;
|
||||
|
||||
import org.springframework.beans.BeansException;
|
||||
import org.springframework.context.ApplicationContext;
|
||||
import org.springframework.context.ApplicationContextAware;
|
||||
|
||||
public class ApplicationContextHelper implements ApplicationContextAware{
|
||||
private static ApplicationContext appCtx;
|
||||
/**
|
||||
* 此方法可以把ApplicationContext对象inject到当前类中作为一个静态成员变量。
|
||||
* @param applicationContext ApplicationContext 对象.
|
||||
* @throws BeansException
|
||||
*/
|
||||
@Override
|
||||
public void setApplicationContext( ApplicationContext applicationContext ) throws BeansException {
|
||||
appCtx = applicationContext;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取ApplicationContext
|
||||
* @return
|
||||
*/
|
||||
public static ApplicationContext getApplicationContext(){
|
||||
return appCtx;
|
||||
}
|
||||
|
||||
/**
|
||||
* 这是一个便利的方法,帮助我们快速得到一个BEAN
|
||||
* @param beanName bean的名字
|
||||
* @return 返回一个bean对象
|
||||
*/
|
||||
public static Object getBean(String beanName) {
|
||||
return appCtx.getBean(beanName);
|
||||
}
|
||||
|
||||
}
|
|
@ -64,15 +64,17 @@ public class Normalizer {
|
|||
}
|
||||
|
||||
public static void main(String []args) {
|
||||
String content = "大声道Corvette123 *&%9&^&%&^%(*_asd's_12 L'Aur--ore, 1766-1775";
|
||||
int a = 1 + 11000/11000;
|
||||
String targetTableName = "relative_memo_to_open_source_projects_" + a;
|
||||
//System.out.println(StringFilter(content));
|
||||
String result = content.replaceAll("[^0-9a-zA-Z]"," ");
|
||||
System.out.println(result);
|
||||
System.out.println(normalize(content));
|
||||
System.out.println(content.lastIndexOf(" "));
|
||||
//System.out.println(StringFilter(content));
|
||||
// String content = "大声道Corvette123 *&%9&^&%&^%(*_asd's_12 L'Aur--ore, 1766-1775";
|
||||
// int a = 1 + 11000/11000;
|
||||
// String targetTableName = "relative_memo_to_open_source_projects_" + a;
|
||||
// //System.out.println(StringFilter(content));
|
||||
// String result = content.replaceAll("[^0-9a-zA-Z]"," ");
|
||||
// System.out.println(result);
|
||||
// System.out.println(normalize(content));
|
||||
// System.out.println(content.lastIndexOf(" "));
|
||||
// //System.out.println(StringFilter(content));
|
||||
String a = "'One' PHP cache";
|
||||
System.out.println(getList(a));
|
||||
}
|
||||
/**
|
||||
* if a string only contains digital
|
||||
|
|
|
@ -6,6 +6,6 @@
|
|||
<entry key="ext_dict">ext.dic;</entry>
|
||||
|
||||
<!--用户可以在这里配置自己的扩展停止词字典-->
|
||||
<entry key="ext_stopwords">stopword_1.dic;code_keywords.dic</entry>
|
||||
<entry key="ext_stopwords"></entry>
|
||||
|
||||
</properties>
|
|
@ -11,6 +11,11 @@
|
|||
http://www.springframework.org/schema/tx/spring-tx.xsd
|
||||
http://www.springframework.org/schema/context
|
||||
http://www.springframework.org/schema/context/spring-context-3.0.xsd">
|
||||
|
||||
<bean id="jdbcTemplate" class="org.springframework.jdbc.core.JdbcTemplate">
|
||||
<property name="dataSource" ref="dataSource"></property>
|
||||
</bean>
|
||||
<bean id="SpringApplicationContext" class="com.ossean.match.utils.ApplicationContextHelper"></bean>
|
||||
|
||||
<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
|
||||
<property name="dataSource" ref="dataSource" />
|
||||
|
@ -27,10 +32,10 @@
|
|||
class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
|
||||
<property name="dataSource" ref="dataSource" />
|
||||
</bean>
|
||||
|
||||
|
||||
<!-- enable transaction annotation support -->
|
||||
<tx:annotation-driven transaction-manager="txManager" />
|
||||
|
||||
|
||||
|
||||
<bean id="dataSource" class="org.apache.commons.dbcp.BasicDataSource"
|
||||
destroy-method="close">
|
||||
|
|
|
@ -0,0 +1,697 @@
|
|||
private
|
||||
protected
|
||||
public
|
||||
abstract
|
||||
class
|
||||
extends
|
||||
final
|
||||
implements
|
||||
interface
|
||||
native
|
||||
new
|
||||
static
|
||||
strictfp
|
||||
synchronized
|
||||
transient
|
||||
volatile
|
||||
break
|
||||
continue
|
||||
return
|
||||
do
|
||||
while
|
||||
if
|
||||
else
|
||||
for
|
||||
instanceof
|
||||
switch
|
||||
case
|
||||
default
|
||||
catch
|
||||
finally
|
||||
throw
|
||||
throws
|
||||
try
|
||||
import
|
||||
package
|
||||
boolean
|
||||
byte
|
||||
char
|
||||
double
|
||||
float
|
||||
int
|
||||
long
|
||||
short
|
||||
null
|
||||
true
|
||||
false
|
||||
super
|
||||
this
|
||||
void
|
||||
form
|
||||
do
|
||||
sourceforge
|
||||
com
|
||||
cn
|
||||
www
|
||||
http
|
||||
js
|
||||
script
|
||||
a
|
||||
test
|
||||
error
|
||||
exception
|
||||
about
|
||||
website
|
||||
able
|
||||
abstract
|
||||
console
|
||||
sql
|
||||
windows
|
||||
exe
|
||||
txt
|
||||
doc
|
||||
xls
|
||||
local
|
||||
net
|
||||
web
|
||||
server
|
||||
from
|
||||
using
|
||||
error
|
||||
can
|
||||
date
|
||||
file
|
||||
xml
|
||||
can
|
||||
system
|
||||
url
|
||||
value
|
||||
create
|
||||
text
|
||||
set
|
||||
get
|
||||
list
|
||||
table
|
||||
select
|
||||
distinct
|
||||
object
|
||||
open
|
||||
close
|
||||
clear
|
||||
all
|
||||
time
|
||||
have
|
||||
org
|
||||
main
|
||||
start
|
||||
end
|
||||
version
|
||||
private
|
||||
public
|
||||
index
|
||||
api
|
||||
method
|
||||
source
|
||||
root
|
||||
content
|
||||
write
|
||||
read
|
||||
view
|
||||
one
|
||||
page
|
||||
run
|
||||
log
|
||||
win
|
||||
bin
|
||||
first
|
||||
next
|
||||
src
|
||||
app
|
||||
request
|
||||
println
|
||||
namespace
|
||||
printf
|
||||
button
|
||||
title
|
||||
local
|
||||
define
|
||||
lib
|
||||
make
|
||||
files
|
||||
help
|
||||
article
|
||||
bool
|
||||
boolean
|
||||
config
|
||||
load
|
||||
args
|
||||
date
|
||||
thread
|
||||
cpu
|
||||
more
|
||||
document
|
||||
util
|
||||
info
|
||||
home
|
||||
style
|
||||
body
|
||||
print
|
||||
hello
|
||||
world
|
||||
find
|
||||
left
|
||||
std
|
||||
debug
|
||||
etc
|
||||
like
|
||||
top
|
||||
now
|
||||
map
|
||||
context
|
||||
other
|
||||
post
|
||||
format
|
||||
client
|
||||
encoding
|
||||
session
|
||||
program
|
||||
database
|
||||
control
|
||||
language
|
||||
base
|
||||
process
|
||||
want
|
||||
values
|
||||
used
|
||||
status
|
||||
project
|
||||
color
|
||||
array
|
||||
localhost
|
||||
please
|
||||
click
|
||||
integer
|
||||
example
|
||||
archive
|
||||
response
|
||||
command
|
||||
event
|
||||
param
|
||||
what
|
||||
core
|
||||
build
|
||||
link
|
||||
display
|
||||
copy
|
||||
stdio
|
||||
release
|
||||
mode
|
||||
after
|
||||
check
|
||||
cpp
|
||||
c
|
||||
port
|
||||
target
|
||||
should
|
||||
library
|
||||
lib
|
||||
tcp
|
||||
udp
|
||||
state
|
||||
sum
|
||||
free
|
||||
last
|
||||
connect
|
||||
configuration
|
||||
none
|
||||
download
|
||||
software
|
||||
buffer
|
||||
query
|
||||
bit
|
||||
temp
|
||||
word
|
||||
block
|
||||
two
|
||||
send
|
||||
device
|
||||
layout
|
||||
sdk
|
||||
cache
|
||||
alert
|
||||
see
|
||||
change
|
||||
work
|
||||
kernel
|
||||
filter
|
||||
handel
|
||||
save
|
||||
tools
|
||||
min
|
||||
count
|
||||
header
|
||||
level
|
||||
framework
|
||||
sys
|
||||
email
|
||||
push
|
||||
pull
|
||||
heap
|
||||
stack
|
||||
user
|
||||
must
|
||||
demo
|
||||
just
|
||||
search
|
||||
res
|
||||
tmp
|
||||
down
|
||||
option
|
||||
img
|
||||
network
|
||||
model
|
||||
stop
|
||||
problem
|
||||
position
|
||||
node
|
||||
space
|
||||
login
|
||||
font
|
||||
todo
|
||||
background
|
||||
resource
|
||||
mac
|
||||
bytes
|
||||
ios
|
||||
empty
|
||||
note
|
||||
self
|
||||
tag
|
||||
column
|
||||
studio
|
||||
red
|
||||
done
|
||||
baidu
|
||||
sleep
|
||||
convert
|
||||
global
|
||||
field
|
||||
reference
|
||||
way
|
||||
settings
|
||||
simple
|
||||
ctrl
|
||||
wait
|
||||
meta
|
||||
edit
|
||||
runtime
|
||||
store
|
||||
equals
|
||||
services
|
||||
bean
|
||||
ftp
|
||||
exec
|
||||
non
|
||||
enter
|
||||
lock
|
||||
admin
|
||||
configure
|
||||
png
|
||||
jpg
|
||||
abc
|
||||
loop
|
||||
vector
|
||||
setup
|
||||
mail
|
||||
flag
|
||||
machine
|
||||
sample
|
||||
share
|
||||
image
|
||||
ref
|
||||
join
|
||||
where
|
||||
parameter
|
||||
users
|
||||
contain
|
||||
know
|
||||
template
|
||||
class
|
||||
math
|
||||
pdf
|
||||
parse
|
||||
timeout
|
||||
cat
|
||||
through
|
||||
basic
|
||||
invoke
|
||||
resources
|
||||
media
|
||||
uri
|
||||
paltform
|
||||
err
|
||||
sudo
|
||||
range
|
||||
menu
|
||||
mapping
|
||||
tables
|
||||
master
|
||||
memset
|
||||
reset
|
||||
environment
|
||||
pop
|
||||
commit
|
||||
report
|
||||
task
|
||||
active
|
||||
domain
|
||||
good
|
||||
step
|
||||
setting
|
||||
day
|
||||
hash
|
||||
move
|
||||
collections
|
||||
phone
|
||||
engine
|
||||
callback
|
||||
datetime
|
||||
objects
|
||||
mobile
|
||||
profile
|
||||
always
|
||||
storage
|
||||
alt
|
||||
trim
|
||||
applications
|
||||
via
|
||||
design
|
||||
nothing
|
||||
makefile
|
||||
clean
|
||||
small
|
||||
solution
|
||||
custom
|
||||
people
|
||||
projects
|
||||
family
|
||||
attributes
|
||||
enum
|
||||
own
|
||||
works
|
||||
messages
|
||||
low
|
||||
signal
|
||||
browser
|
||||
give
|
||||
aaa
|
||||
event
|
||||
assert
|
||||
look
|
||||
company
|
||||
person
|
||||
either
|
||||
case
|
||||
desktop
|
||||
touch
|
||||
book
|
||||
unit
|
||||
docs
|
||||
params
|
||||
dump
|
||||
pool
|
||||
feature
|
||||
hashmap
|
||||
seconds
|
||||
owner
|
||||
goto
|
||||
keys
|
||||
editor
|
||||
env
|
||||
month
|
||||
year
|
||||
loading
|
||||
changes
|
||||
black
|
||||
fetch
|
||||
guide
|
||||
enterprise
|
||||
multi
|
||||
foundation
|
||||
reflect
|
||||
unique
|
||||
upload
|
||||
play
|
||||
idea
|
||||
exist
|
||||
elements
|
||||
account
|
||||
numbers
|
||||
examples
|
||||
plain
|
||||
solid
|
||||
game
|
||||
hide
|
||||
apps
|
||||
symbol
|
||||
layer
|
||||
least
|
||||
weight
|
||||
beta
|
||||
jpeg
|
||||
attr
|
||||
refresh
|
||||
apply
|
||||
focus
|
||||
fun
|
||||
cfg
|
||||
threads
|
||||
receive
|
||||
recieved
|
||||
easy
|
||||
hard
|
||||
mode
|
||||
logs
|
||||
password
|
||||
passwd
|
||||
packages
|
||||
ignore
|
||||
scripts
|
||||
cell
|
||||
later
|
||||
plus
|
||||
his
|
||||
pub
|
||||
going
|
||||
logging
|
||||
channel
|
||||
think
|
||||
serial
|
||||
things
|
||||
loader
|
||||
player
|
||||
libs
|
||||
team
|
||||
backup
|
||||
conector
|
||||
side
|
||||
utf8
|
||||
speed
|
||||
doing
|
||||
price
|
||||
timestamp
|
||||
repository
|
||||
init
|
||||
HelloWorld
|
||||
tesing
|
||||
scan
|
||||
components
|
||||
speed
|
||||
unlock
|
||||
products
|
||||
mid
|
||||
said
|
||||
token
|
||||
optional
|
||||
today
|
||||
rest
|
||||
pack
|
||||
coding
|
||||
contain
|
||||
points
|
||||
days
|
||||
safe
|
||||
typeof
|
||||
community
|
||||
others
|
||||
manual
|
||||
sets
|
||||
closed
|
||||
fine
|
||||
groups
|
||||
specify
|
||||
sources
|
||||
workder
|
||||
problems
|
||||
perform
|
||||
servers
|
||||
articles
|
||||
skip
|
||||
blocks
|
||||
connections
|
||||
related
|
||||
notify
|
||||
onload
|
||||
known
|
||||
assembly
|
||||
useful
|
||||
monitor
|
||||
thing
|
||||
forum
|
||||
come
|
||||
notification
|
||||
rules
|
||||
developers
|
||||
cross
|
||||
notice
|
||||
soft
|
||||
progress
|
||||
fork
|
||||
hosts
|
||||
logger
|
||||
role
|
||||
everything
|
||||
future
|
||||
references
|
||||
jni
|
||||
locale
|
||||
follow
|
||||
words
|
||||
updates
|
||||
necessary
|
||||
writer
|
||||
mouse
|
||||
sign
|
||||
repoter
|
||||
containing
|
||||
route
|
||||
analysis
|
||||
four
|
||||
comments
|
||||
programs
|
||||
maps
|
||||
turn
|
||||
versions
|
||||
drive
|
||||
parser
|
||||
track
|
||||
money
|
||||
utils
|
||||
reload
|
||||
decimal
|
||||
resume
|
||||
behavior
|
||||
hand
|
||||
board
|
||||
light
|
||||
regex
|
||||
terminal
|
||||
reduce
|
||||
requests
|
||||
fff
|
||||
0xfffff
|
||||
strong
|
||||
edge
|
||||
schedule
|
||||
pair
|
||||
swf
|
||||
tom
|
||||
dependency
|
||||
rule
|
||||
escape
|
||||
hook
|
||||
temporary
|
||||
issues
|
||||
exchange
|
||||
review
|
||||
various
|
||||
life
|
||||
friend
|
||||
face
|
||||
complex
|
||||
readme
|
||||
prop
|
||||
expert
|
||||
extend
|
||||
often
|
||||
automatic
|
||||
capture
|
||||
difference
|
||||
scheme
|
||||
develop
|
||||
together
|
||||
steps
|
||||
reply
|
||||
beginning
|
||||
understand
|
||||
identified
|
||||
wall
|
||||
along
|
||||
market
|
||||
minute
|
||||
hour
|
||||
university
|
||||
plan
|
||||
templates
|
||||
codes
|
||||
ccc
|
||||
diff
|
||||
ppt
|
||||
study
|
||||
abcd
|
||||
lost
|
||||
choice
|
||||
explain
|
||||
likely
|
||||
guid
|
||||
taken
|
||||
period
|
||||
FAQ
|
||||
symbols
|
||||
thus
|
||||
tips
|
||||
compute
|
||||
school
|
||||
languages
|
||||
describe
|
||||
resolution
|
||||
deal
|
||||
detect
|
||||
yellow
|
||||
hope
|
||||
visit
|
||||
xyz
|
||||
iii
|
||||
dog
|
||||
jump
|
||||
knowledge
|
||||
publish
|
||||
inserted
|
||||
levels
|
||||
chip
|
||||
preference
|
||||
done
|
||||
adjust
|
||||
earth
|
||||
expressions
|
||||
house
|
||||
central
|
||||
rich
|
||||
chat
|
||||
sessions
|
||||
deep
|
||||
when
|
||||
where
|
||||
how
|
||||
who
|
||||
仅供参考
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
|
||||
<properties>
|
||||
<comment>TableFlow</comment>
|
||||
<entry key="startId">0</entry>
|
||||
<entry key="endId">200000</entry>
|
||||
<entry key="batchSize">5000</entry>
|
||||
<entry key="newComing">false</entry>
|
||||
</properties>
|
|
@ -33,703 +33,4 @@ will
|
|||
with
|
||||
小
|
||||
一个
|
||||
的
|
||||
|
||||
codekeywords:
|
||||
private
|
||||
protected
|
||||
public
|
||||
abstract
|
||||
class
|
||||
extends
|
||||
final
|
||||
implements
|
||||
interface
|
||||
native
|
||||
new
|
||||
static
|
||||
strictfp
|
||||
synchronized
|
||||
transient
|
||||
volatile
|
||||
break
|
||||
continue
|
||||
return
|
||||
do
|
||||
while
|
||||
if
|
||||
else
|
||||
for
|
||||
instanceof
|
||||
switch
|
||||
case
|
||||
defult
|
||||
catch
|
||||
finally
|
||||
throw
|
||||
throws
|
||||
try
|
||||
import
|
||||
package
|
||||
boolean
|
||||
byte
|
||||
char
|
||||
double
|
||||
float
|
||||
int
|
||||
long
|
||||
short
|
||||
null
|
||||
true
|
||||
false
|
||||
super
|
||||
this
|
||||
void
|
||||
form
|
||||
do
|
||||
sourceforge
|
||||
com
|
||||
cn
|
||||
www
|
||||
http
|
||||
js
|
||||
script
|
||||
a
|
||||
test
|
||||
error
|
||||
exception
|
||||
about
|
||||
website
|
||||
able
|
||||
abstract
|
||||
console
|
||||
sql
|
||||
windows
|
||||
exe
|
||||
txt
|
||||
doc
|
||||
xls
|
||||
local
|
||||
net
|
||||
web
|
||||
server
|
||||
from
|
||||
using
|
||||
error
|
||||
can
|
||||
date
|
||||
file
|
||||
xml
|
||||
can
|
||||
system
|
||||
url
|
||||
value
|
||||
create
|
||||
text
|
||||
set
|
||||
get
|
||||
list
|
||||
table
|
||||
select
|
||||
distinct
|
||||
object
|
||||
open
|
||||
close
|
||||
clear
|
||||
all
|
||||
time
|
||||
have
|
||||
org
|
||||
main
|
||||
start
|
||||
end
|
||||
version
|
||||
private
|
||||
public
|
||||
index
|
||||
api
|
||||
method
|
||||
source
|
||||
root
|
||||
content
|
||||
write
|
||||
read
|
||||
view
|
||||
one
|
||||
page
|
||||
run
|
||||
log
|
||||
win
|
||||
bin
|
||||
first
|
||||
next
|
||||
src
|
||||
app
|
||||
request
|
||||
println
|
||||
namespace
|
||||
printf
|
||||
button
|
||||
title
|
||||
local
|
||||
define
|
||||
lib
|
||||
make
|
||||
files
|
||||
help
|
||||
article
|
||||
bool
|
||||
boolean
|
||||
config
|
||||
load
|
||||
args
|
||||
date
|
||||
thread
|
||||
cpu
|
||||
more
|
||||
document
|
||||
util
|
||||
info
|
||||
home
|
||||
style
|
||||
body
|
||||
print
|
||||
hello
|
||||
world
|
||||
find
|
||||
left
|
||||
std
|
||||
debug
|
||||
etc
|
||||
like
|
||||
top
|
||||
now
|
||||
map
|
||||
context
|
||||
other
|
||||
post
|
||||
format
|
||||
client
|
||||
encoding
|
||||
session
|
||||
program
|
||||
datebase
|
||||
control
|
||||
language
|
||||
base
|
||||
process
|
||||
want
|
||||
values
|
||||
used
|
||||
status
|
||||
project
|
||||
color
|
||||
array
|
||||
loaclhost
|
||||
please
|
||||
click
|
||||
integer
|
||||
example
|
||||
archive
|
||||
response
|
||||
command
|
||||
event
|
||||
param
|
||||
what
|
||||
core
|
||||
build
|
||||
link
|
||||
display
|
||||
copy
|
||||
stdio
|
||||
release
|
||||
mode
|
||||
after
|
||||
check
|
||||
cpp
|
||||
c
|
||||
port
|
||||
target
|
||||
should
|
||||
library
|
||||
lib
|
||||
tcp
|
||||
udp
|
||||
state
|
||||
sum
|
||||
free
|
||||
last
|
||||
connect
|
||||
configuration
|
||||
none
|
||||
download
|
||||
software
|
||||
buffer
|
||||
query
|
||||
bit
|
||||
temp
|
||||
word
|
||||
block
|
||||
two
|
||||
send
|
||||
device
|
||||
layout
|
||||
sdk
|
||||
cache
|
||||
alert
|
||||
see
|
||||
change
|
||||
work
|
||||
kernel
|
||||
filter
|
||||
handel
|
||||
save
|
||||
tools
|
||||
min
|
||||
count
|
||||
header
|
||||
level
|
||||
framework
|
||||
sys
|
||||
email
|
||||
push
|
||||
pull
|
||||
heap
|
||||
stack
|
||||
user
|
||||
must
|
||||
demo
|
||||
just
|
||||
serach
|
||||
res
|
||||
tmp
|
||||
down
|
||||
option
|
||||
img
|
||||
network
|
||||
model
|
||||
stop
|
||||
problem
|
||||
position
|
||||
node
|
||||
space
|
||||
login
|
||||
font
|
||||
todo
|
||||
background
|
||||
resource
|
||||
mac
|
||||
bytes
|
||||
ios
|
||||
empty
|
||||
note
|
||||
self
|
||||
tag
|
||||
column
|
||||
studio
|
||||
red
|
||||
done
|
||||
baidu
|
||||
sleep
|
||||
convert
|
||||
global
|
||||
field
|
||||
reference
|
||||
way
|
||||
settings
|
||||
simple
|
||||
ctrl
|
||||
wait
|
||||
meta
|
||||
edit
|
||||
runtime
|
||||
store
|
||||
equals
|
||||
services
|
||||
bean
|
||||
ftp
|
||||
exec
|
||||
non
|
||||
enter
|
||||
lock
|
||||
admin
|
||||
configure
|
||||
png
|
||||
jpg
|
||||
abc
|
||||
loop
|
||||
vector
|
||||
setup
|
||||
mail
|
||||
flag
|
||||
machine
|
||||
sample
|
||||
share
|
||||
image
|
||||
ref
|
||||
join
|
||||
where
|
||||
parameter
|
||||
users
|
||||
contain
|
||||
know
|
||||
template
|
||||
class
|
||||
math
|
||||
pdf
|
||||
parse
|
||||
timeout
|
||||
cat
|
||||
through
|
||||
basic
|
||||
invoke
|
||||
resources
|
||||
media
|
||||
uri
|
||||
paltform
|
||||
err
|
||||
sudo
|
||||
range
|
||||
menu
|
||||
mapping
|
||||
tables
|
||||
master
|
||||
memset
|
||||
reset
|
||||
environment
|
||||
pop
|
||||
commit
|
||||
report
|
||||
task
|
||||
active
|
||||
domain
|
||||
good
|
||||
step
|
||||
setting
|
||||
day
|
||||
hash
|
||||
move
|
||||
collections
|
||||
phone
|
||||
engine
|
||||
callback
|
||||
datetime
|
||||
objects
|
||||
mobile
|
||||
profile
|
||||
always
|
||||
storage
|
||||
alt
|
||||
trim
|
||||
applications
|
||||
via
|
||||
design
|
||||
nothing
|
||||
makefile
|
||||
clean
|
||||
small
|
||||
solution
|
||||
custom
|
||||
people
|
||||
projects
|
||||
family
|
||||
attributes
|
||||
enum
|
||||
own
|
||||
works
|
||||
messages
|
||||
low
|
||||
signal
|
||||
browser
|
||||
give
|
||||
aaa
|
||||
event
|
||||
assert
|
||||
look
|
||||
company
|
||||
person
|
||||
either
|
||||
case
|
||||
desktop
|
||||
touch
|
||||
book
|
||||
unit
|
||||
docs
|
||||
params
|
||||
dump
|
||||
pool
|
||||
feature
|
||||
hashmap
|
||||
seconds
|
||||
owner
|
||||
goto
|
||||
keys
|
||||
editor
|
||||
env
|
||||
month
|
||||
year
|
||||
loading
|
||||
changes
|
||||
black
|
||||
fetch
|
||||
guide
|
||||
enterprise
|
||||
multi
|
||||
foundation
|
||||
reflect
|
||||
unique
|
||||
upload
|
||||
play
|
||||
idea
|
||||
exist
|
||||
elements
|
||||
account
|
||||
numbers
|
||||
examples
|
||||
plain
|
||||
solid
|
||||
game
|
||||
hide
|
||||
apps
|
||||
symbol
|
||||
layer
|
||||
least
|
||||
weight
|
||||
beta
|
||||
jpeg
|
||||
attr
|
||||
refresh
|
||||
apply
|
||||
focus
|
||||
fun
|
||||
cfg
|
||||
threads
|
||||
receive
|
||||
recieved
|
||||
easy
|
||||
hard
|
||||
mode
|
||||
logs
|
||||
password
|
||||
passwd
|
||||
packages
|
||||
ignore
|
||||
scripts
|
||||
cell
|
||||
later
|
||||
plus
|
||||
his
|
||||
pub
|
||||
going
|
||||
logging
|
||||
channel
|
||||
think
|
||||
serial
|
||||
things
|
||||
loader
|
||||
player
|
||||
libs
|
||||
team
|
||||
backup
|
||||
conector
|
||||
side
|
||||
utf8
|
||||
speed
|
||||
doing
|
||||
price
|
||||
timestamp
|
||||
repository
|
||||
init
|
||||
HelloWorld
|
||||
tesing
|
||||
scan
|
||||
components
|
||||
speed
|
||||
unlock
|
||||
products
|
||||
mid
|
||||
said
|
||||
token
|
||||
optional
|
||||
today
|
||||
rest
|
||||
pack
|
||||
coding
|
||||
contain
|
||||
points
|
||||
days
|
||||
safe
|
||||
typeof
|
||||
community
|
||||
others
|
||||
manual
|
||||
sets
|
||||
closed
|
||||
fine
|
||||
groups
|
||||
specify
|
||||
sources
|
||||
workder
|
||||
problems
|
||||
perform
|
||||
servers
|
||||
articles
|
||||
skip
|
||||
blocks
|
||||
connections
|
||||
related
|
||||
notify
|
||||
onload
|
||||
known
|
||||
assembly
|
||||
useful
|
||||
monitor
|
||||
thing
|
||||
forum
|
||||
come
|
||||
notification
|
||||
rules
|
||||
developers
|
||||
cross
|
||||
notice
|
||||
soft
|
||||
progress
|
||||
fork
|
||||
hosts
|
||||
logger
|
||||
role
|
||||
everything
|
||||
future
|
||||
references
|
||||
jni
|
||||
locale
|
||||
follow
|
||||
words
|
||||
updates
|
||||
necessary
|
||||
writer
|
||||
mouse
|
||||
sign
|
||||
repoter
|
||||
containing
|
||||
route
|
||||
analysis
|
||||
four
|
||||
comments
|
||||
programs
|
||||
maps
|
||||
turn
|
||||
versions
|
||||
drive
|
||||
parser
|
||||
track
|
||||
money
|
||||
utils
|
||||
reload
|
||||
decimal
|
||||
resume
|
||||
behavior
|
||||
hand
|
||||
board
|
||||
light
|
||||
regex
|
||||
terminal
|
||||
reduce
|
||||
requests
|
||||
fff
|
||||
0xfffff
|
||||
strong
|
||||
edge
|
||||
schedule
|
||||
pair
|
||||
swf
|
||||
tom
|
||||
dependency
|
||||
rule
|
||||
escape
|
||||
hook
|
||||
temporary
|
||||
issues
|
||||
exchange
|
||||
review
|
||||
various
|
||||
life
|
||||
friend
|
||||
face
|
||||
complex
|
||||
readme
|
||||
prop
|
||||
expert
|
||||
extend
|
||||
often
|
||||
automatic
|
||||
capture
|
||||
difference
|
||||
scheme
|
||||
develop
|
||||
together
|
||||
steps
|
||||
reply
|
||||
beginning
|
||||
understand
|
||||
identified
|
||||
wall
|
||||
along
|
||||
market
|
||||
minute
|
||||
hour
|
||||
university
|
||||
plan
|
||||
templates
|
||||
codes
|
||||
ccc
|
||||
diff
|
||||
ppt
|
||||
study
|
||||
abcd
|
||||
lost
|
||||
choice
|
||||
explain
|
||||
likely
|
||||
guid
|
||||
taken
|
||||
period
|
||||
FAQ
|
||||
symbols
|
||||
thus
|
||||
tips
|
||||
compute
|
||||
school
|
||||
languages
|
||||
describe
|
||||
resolution
|
||||
deal
|
||||
detect
|
||||
yellow
|
||||
hope
|
||||
visit
|
||||
xyz
|
||||
iii
|
||||
dog
|
||||
jump
|
||||
knowledge
|
||||
publish
|
||||
inserted
|
||||
levels
|
||||
chip
|
||||
preference
|
||||
done
|
||||
adjust
|
||||
earth
|
||||
expressions
|
||||
house
|
||||
central
|
||||
rich
|
||||
chat
|
||||
sessions
|
||||
deep
|
||||
when
|
||||
where
|
||||
how
|
||||
who
|
||||
仅供参考
|
||||
的
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
|
||||
<properties>
|
||||
<comment>TableFlow</comment>
|
||||
<entry key="startTableId">1</entry>
|
||||
<entry key="endTableId">20</entry>
|
||||
</properties>
|
|
@ -0,0 +1,78 @@
|
|||
package com.ossean.projectmanager.model;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.InvalidPropertiesFormatException;
|
||||
import java.util.List;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
public class Configure {
|
||||
|
||||
private Properties prop = new Properties();
|
||||
Logger log4j = Logger.getLogger(Configure.class);
|
||||
private int startTableId;
|
||||
private int endTableId;
|
||||
|
||||
//自定义构造函数
|
||||
public Configure(String confPath) {
|
||||
confPath = confPath + ".xml";
|
||||
try {
|
||||
URL url = ClassLoader.getSystemResource(confPath);
|
||||
InputStream is = url.openStream();
|
||||
prop.loadFromXML(is);
|
||||
} catch (FileNotFoundException e) {
|
||||
log4j.error("Configure FileNotFoundException !");
|
||||
e.printStackTrace();
|
||||
} catch (InvalidPropertiesFormatException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
config();
|
||||
}
|
||||
|
||||
private void config() {
|
||||
this.startTableId = Integer.parseInt(prop.getProperty("startTableId"));
|
||||
this.endTableId = Integer.parseInt(prop.getProperty("endTableId"));
|
||||
}
|
||||
|
||||
public Properties getProp() {
|
||||
return prop;
|
||||
}
|
||||
public void setProp(Properties prop) {
|
||||
this.prop = prop;
|
||||
}
|
||||
public Logger getLog4j() {
|
||||
return log4j;
|
||||
}
|
||||
public void setLog4j(Logger log4j) {
|
||||
this.log4j = log4j;
|
||||
}
|
||||
|
||||
public int getStartTableId() {
|
||||
return startTableId;
|
||||
}
|
||||
|
||||
public void setStartTableId(int startTableId) {
|
||||
this.startTableId = startTableId;
|
||||
}
|
||||
|
||||
public int getEndTableId() {
|
||||
return endTableId;
|
||||
}
|
||||
|
||||
public void setEndTableId(int endTableId) {
|
||||
this.endTableId = endTableId;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -9,6 +9,7 @@ import org.springframework.stereotype.Component;
|
|||
|
||||
import com.ossean.projectmanager.lasttabledao.RelativeMemoDao;
|
||||
import com.ossean.projectmanager.lasttabledao.RelativeMemoToOpenSourceProjectDao;
|
||||
import com.ossean.projectmanager.model.Configure;
|
||||
import com.ossean.projectmanager.model.RelativeMemo;
|
||||
import com.ossean.projectmanager.model.RelativeMemoToOpenSourceProject;
|
||||
|
||||
|
@ -21,6 +22,7 @@ public class SyncMatchResult {
|
|||
private RelativeMemoToOpenSourceProjectDao matchResultDao;
|
||||
|
||||
Logger logger = Logger.getLogger(this.getClass());
|
||||
private Configure conf;
|
||||
|
||||
/**
|
||||
* 从帖子表中同步帖子信息到匹配结果表
|
||||
|
@ -28,39 +30,32 @@ public class SyncMatchResult {
|
|||
* @throws InterruptedException
|
||||
*/
|
||||
public void syncMatchResult() {
|
||||
while (true) {
|
||||
// int notSyncNum = matchResultDao.getNotSyncCount("relative_memo_to_open_source_projects_1");
|
||||
// if (notSyncNum < 1) {
|
||||
// try {
|
||||
// logger.info("sleeping 10s......");
|
||||
// Thread.sleep(10000);
|
||||
// } catch (InterruptedException e) {
|
||||
// logger.error(e);
|
||||
// }
|
||||
// }
|
||||
for (int i = 1; i <= 70; i++) {
|
||||
String matchResultTableName = "relative_memo_to_open_source_projects_" + String.valueOf(i);
|
||||
//logger.info("starting sync " + matchResultTableName + "......");
|
||||
//long startReadMT = System.currentTimeMillis();
|
||||
List<RelativeMemoToOpenSourceProject> rmtospList = matchResultDao.getMatchResultsByHassync(matchResultTableName, 50000);
|
||||
//long endReadMT = System.currentTimeMillis();
|
||||
//logger.info("get rmtospList: " + (endReadMT - startReadMT)/6000 + " s");
|
||||
if (rmtospList !=null && rmtospList.size() > 0) {
|
||||
for (RelativeMemoToOpenSourceProject rmtosp : rmtospList) {
|
||||
int curMemoId = rmtosp.getRelative_memo_id();
|
||||
RelativeMemo curMemo = lastMemoDao.getById(curMemoId);
|
||||
rmtosp.setCreated_time(curMemo.getCreated_time());
|
||||
rmtosp.setMemo_type(curMemo.getMemo_type());
|
||||
rmtosp.setReplies_num(curMemo.getReplies_num());
|
||||
rmtosp.setView_num_crawled(curMemo.getView_num());
|
||||
matchResultDao.syncMatchResult(matchResultTableName, rmtosp);
|
||||
logger.info("updated " + rmtosp.getId() + "......");
|
||||
}
|
||||
conf = new Configure("configure");
|
||||
int startTableId = conf.getStartTableId();
|
||||
int endTableId = conf.getEndTableId();
|
||||
for (int i = startTableId; i <= endTableId; i++) {
|
||||
String matchResultTableName = "relative_memo_to_open_source_projects_" + String.valueOf(i);
|
||||
logger.info("starting sync " + matchResultTableName + "......");
|
||||
List<RelativeMemoToOpenSourceProject> rmtospList = matchResultDao.getMatchResultsByHassync(matchResultTableName, 50000);
|
||||
long startCircle = System.currentTimeMillis();
|
||||
while(rmtospList !=null && rmtospList.size() > 0){
|
||||
for (RelativeMemoToOpenSourceProject rmtosp : rmtospList) {
|
||||
int curMemoId = rmtosp.getRelative_memo_id();
|
||||
RelativeMemo curMemo = lastMemoDao.getById(curMemoId);
|
||||
rmtosp.setCreated_time(curMemo.getCreated_time());
|
||||
rmtosp.setMemo_type(curMemo.getMemo_type());
|
||||
rmtosp.setReplies_num(curMemo.getReplies_num());
|
||||
rmtosp.setView_num_crawled(curMemo.getView_num());
|
||||
matchResultDao.syncMatchResult(matchResultTableName, rmtosp);
|
||||
|
||||
}
|
||||
//long endCircle = System.currentTimeMillis();
|
||||
//logger.info("endCircle : " + (endCircle - endReadMT)/6000 +" s");
|
||||
logger.info("updated " + rmtospList.get(rmtospList.size()-1).getId() + "......");
|
||||
rmtospList = matchResultDao.getMatchResultsByHassync(matchResultTableName, 50000);
|
||||
}
|
||||
}
|
||||
long endCircle = System.currentTimeMillis();
|
||||
logger.info(matchResultTableName+" sync stop cost : " + (endCircle - startCircle)/6000 +" s");
|
||||
}
|
||||
logger.info("all match table have sync over!!!!");
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
<property name="url"
|
||||
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
|
||||
<property name="username" value="root" />
|
||||
<property name="password" value="1234" />
|
||||
<property name="password" value="123456" />
|
||||
<property name="validationQuery" value="SELECT 1" />
|
||||
<property name="testOnBorrow" value="true"/>
|
||||
</bean>
|
||||
|
@ -41,7 +41,7 @@
|
|||
<property name="url"
|
||||
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
|
||||
<property name="username" value="root" />
|
||||
<property name="password" value="1234" />
|
||||
<property name="password" value="123456" />
|
||||
<property name="validationQuery" value="SELECT 1" />
|
||||
<property name="testOnBorrow" value="true"/>
|
||||
</bean>
|
||||
|
|
|
@ -0,0 +1,7 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
|
||||
<properties>
|
||||
<comment>TableFlow</comment>
|
||||
<entry key="startTableId">1</entry>
|
||||
<entry key="endTableId">2</entry>
|
||||
</properties>
|
|
@ -15,8 +15,14 @@
|
|||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
<buildCommand>
|
||||
<name>org.springframework.ide.eclipse.core.springbuilder</name>
|
||||
<arguments>
|
||||
</arguments>
|
||||
</buildCommand>
|
||||
</buildSpec>
|
||||
<natures>
|
||||
<nature>org.springframework.ide.eclipse.core.springnature</nature>
|
||||
<nature>org.eclipse.jdt.core.javanature</nature>
|
||||
<nature>org.eclipse.m2e.core.maven2Nature</nature>
|
||||
</natures>
|
||||
|
|
|
@ -17,7 +17,7 @@ angularjs
|
|||
angular2
|
||||
angular
|
||||
node.js
|
||||
jqury
|
||||
jquery
|
||||
html
|
||||
spark
|
||||
nginx
|
||||
|
|
|
@ -6,7 +6,15 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Date;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import javax.annotation.Resource;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
@ -16,8 +24,10 @@ import org.springframework.stereotype.Component;
|
|||
import org.springframework.transaction.annotation.Propagation;
|
||||
import org.springframework.transaction.annotation.Transactional;
|
||||
|
||||
import com.ossean.dao.DBDest;
|
||||
import com.ossean.dao.DBSource;
|
||||
import com.ossean.dao.GatherDao;
|
||||
import com.ossean.dao2.PlatformProjectDao;
|
||||
import com.ossean.model.GatherProjectsModel;
|
||||
import com.ossean.util.EDDHandler;
|
||||
import com.ossean.util.FileReader;
|
||||
|
@ -32,12 +42,22 @@ public class GetSynonyms{
|
|||
@Resource
|
||||
private DBSource dbSource;
|
||||
@Resource
|
||||
public DBDest dbDest;
|
||||
@Resource
|
||||
private GatherDao gatherDao;
|
||||
@Resource
|
||||
private PlatformProjectDao paltformDao;
|
||||
|
||||
private static String gatherProjectsTableName = TableName.gatherProjectsTableName;
|
||||
private static String gatherProjectsTableName ="dubsynonymsprj_test";//TableName.gatherProjectsTableName;
|
||||
private static String synonymsTableName = TableName.synonymsTableName;
|
||||
private static String pointerTableName = TableName.pointerTableName;
|
||||
private static int batchSize = 1000;
|
||||
|
||||
|
||||
//为了很好的将别名提取的程序结构化,特设以下三个全局变量在从项目名称中提取别名时使用
|
||||
String projectNameWithoutBracket = "";//项目名去除括号
|
||||
String projectNameWithoutComName = "";//项目名称去除括号和公司、基金名字
|
||||
String bracket = "";//括号中内容
|
||||
|
||||
public void start() {
|
||||
long start_all = System.currentTimeMillis();
|
||||
|
@ -63,19 +83,20 @@ public class GetSynonyms{
|
|||
}
|
||||
long start = System.currentTimeMillis();
|
||||
for (GatherProjectsModel model : gpmList) {
|
||||
Map<String, Integer> synonymsList = getSynonyms(model);
|
||||
updateTable(synonymsList,model);
|
||||
// if(synonymsList !=null && synonymsList.size()!=0)
|
||||
// for (String name : synonymsList.keySet()) {
|
||||
// dbSource.insertSynonymsNew(synonymsTableName,
|
||||
// model.getId(), name, synonymsList.get(name));
|
||||
// }
|
||||
// // 更新update_mark=1
|
||||
// gatherDao.updateMark(gatherProjectsTableName, 1, model.getId());
|
||||
Map<String, Integer> synonymMap = getSynonyms(model);
|
||||
|
||||
try
|
||||
{
|
||||
//当多个项目具有相同别名的时候,选出最适合的那个项目,并将其余项目的该别名所对应的flag置0
|
||||
synonymMap = synonymsFilter(synonymMap,synonymsTableName,model);
|
||||
}
|
||||
catch(Exception e)
|
||||
{
|
||||
logger.error("prjId:" + model.getId() + " 在处理多个项目有相同别名的时候出现错误:" + e);
|
||||
}
|
||||
updateTable(synonymMap,model);
|
||||
}
|
||||
// long end = System.currentTimeMillis();
|
||||
// logger.warn("insert synonyms:deal with "+gpmList.size()+" projects cost: "+(float)(end
|
||||
// - start)/60000+" minutes");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -89,6 +110,167 @@ public class GetSynonyms{
|
|||
// 更新update_mark=1
|
||||
gatherDao.updateMark(gatherProjectsTableName, 1, model.getId());
|
||||
}
|
||||
public Map<String,Integer> synonymsFilter(Map<String,Integer> synonymMap,String targetTable,GatherProjectsModel model)
|
||||
{
|
||||
Map<String,Integer> copy = synonymMap;
|
||||
for(String key:copy.keySet())
|
||||
{
|
||||
if(copy.get(key) == 1)
|
||||
{
|
||||
GatherProjectsModel tmp = model;
|
||||
//通过别名查找别名表中具有相同别名且flag为1的项目Id
|
||||
Set<Integer> dupSynonymPrj =getSameSynonymPrj(key,targetTable);
|
||||
if(dupSynonymPrj != null)
|
||||
{
|
||||
Iterator<Integer> iterator = dupSynonymPrj.iterator();
|
||||
while(iterator.hasNext())
|
||||
{
|
||||
GatherProjectsModel prj = dbSource.getGatherProjectById(gatherProjectsTableName ,iterator.next());
|
||||
//迭代求取最优项目,保证tmp始终代表最适合这一别名的项目
|
||||
tmp = getBetterPrj(tmp,prj);
|
||||
}
|
||||
}
|
||||
|
||||
if(tmp.getId()==model.getId() && tmp != null)
|
||||
{
|
||||
//当前项目为最适合这一别名的项目,就更新别名表中其余项目的flag为0
|
||||
dbDest.updateSynoymmings(targetTable,key,model.getId(),0);
|
||||
}
|
||||
else
|
||||
{
|
||||
synonymMap.put(key, 0);
|
||||
dbDest.updateSynoymmings(targetTable, key, tmp.getId(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return synonymMap;
|
||||
|
||||
}
|
||||
|
||||
public Set<Integer> getSameSynonymPrj(String key,String targetTable)
|
||||
{
|
||||
Set<Integer> dupSynonymPrj = new HashSet<Integer>();
|
||||
dupSynonymPrj = dbDest.selectSameSynonymPrj(targetTable,key,1);
|
||||
|
||||
return dupSynonymPrj;
|
||||
}
|
||||
|
||||
/**
|
||||
* 比较两个项目哪一个更适合某一别名
|
||||
* */
|
||||
private GatherProjectsModel getBetterPrj(GatherProjectsModel model1, GatherProjectsModel model2) {
|
||||
// TODO Auto-generated method stub
|
||||
GatherProjectsModel betterPrj=null;
|
||||
//不同的社区赋给不同的值
|
||||
int communityValue1 = getCommunityValue(model1);
|
||||
int communityValue2 = getCommunityValue(model2);
|
||||
//如果来自同一社区就从抽取数据库中查询其有代表意义的数据比较热度
|
||||
if(communityValue1 == communityValue2)
|
||||
{
|
||||
if(communityValue1 == -1)
|
||||
return model1;
|
||||
else
|
||||
{
|
||||
if(communityValue1 == 4)
|
||||
{
|
||||
/**oschina用star_num具有代表性,
|
||||
**用List是因为抽取数据库中未经汇总,有相同项目存在,只需求出list取第一个值即可
|
||||
**/
|
||||
List<Integer> hot1 = paltformDao.getHotValue("oschina_project","star_num",model1.getUrl_md5());
|
||||
List<Integer> hot2 = paltformDao.getHotValue("oschina_project","star_num",model2.getUrl_md5());
|
||||
if(hot1.size()>0 && hot2.size()>0)return hot1.get(0) > hot2.get(0) ? model1:model2;
|
||||
else
|
||||
if(hot1.size() == 0) return model2;
|
||||
else return model1;
|
||||
|
||||
}
|
||||
if(communityValue1 == 3)
|
||||
{
|
||||
List<Integer> hot1 = paltformDao.getHotValue("openhub_project","user_num",model1.getUrl_md5());
|
||||
List<Integer> hot2 = paltformDao.getHotValue("openhub_project","user_num",model2.getUrl_md5());
|
||||
if(hot1.size()>0 && hot2.size()>0)return hot1.get(0) > hot2.get(0) ? model1:model2;
|
||||
else
|
||||
if(hot1.size() == 0) return model2;
|
||||
else return model1;
|
||||
}
|
||||
if(communityValue1 == 2)
|
||||
{
|
||||
List<Integer> hot1 = paltformDao.getHotValue("sourceforge_project","download_num",model1.getUrl_md5());
|
||||
List<Integer> hot2 = paltformDao.getHotValue("sourceforge_project","download_num",model2.getUrl_md5());
|
||||
if(hot1.size()>0 && hot2.size()>0)return hot1.get(0) > hot2.get(0) ? model1:model2;
|
||||
else
|
||||
if(hot1.size() == 0) return model2;
|
||||
else return model1;
|
||||
}
|
||||
if(communityValue1 == 1)
|
||||
{
|
||||
List<Integer> hot1 = paltformDao.getHotValue("github","star_num",model1.getUrl_md5());
|
||||
List<Integer> hot2 = paltformDao.getHotValue("github","star_num",model2.getUrl_md5());
|
||||
if(hot1.size()>0 && hot2.size()>0)return hot1.get(0) > hot2.get(0) ? model1:model2;
|
||||
else
|
||||
if(hot1.size() == 0) return model2;
|
||||
else return model1;
|
||||
}
|
||||
if(communityValue1 == 0)
|
||||
{
|
||||
//freecode项目抽取结果中只有创建时间可以利用,创建时间较早的认为比较可信
|
||||
DateFormat df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
DateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||
List<String> time1 = paltformDao.getTime("gather_projects", "extracted_time", model1.getUrl_md5());
|
||||
List<String> time2 = paltformDao.getTime("gather_projects", "extracted_time",model2.getUrl_md5());
|
||||
if(time1!=null && time2!=null)
|
||||
try {
|
||||
Date dt1 = df1.parse(time1.get(0));
|
||||
Date dt2 = df2.parse(time2.get(0));
|
||||
if(dt1 !=null && dt2!=null)
|
||||
{
|
||||
if(dt1.getTime() > dt2.getTime())
|
||||
return model1;
|
||||
else
|
||||
return model2;
|
||||
}
|
||||
} catch (ParseException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
else
|
||||
if(time1 != null)
|
||||
return model1;
|
||||
else
|
||||
return model2;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
return communityValue1 > communityValue2 ? model1:model2;
|
||||
|
||||
return betterPrj;
|
||||
}
|
||||
/**
|
||||
* 根据项目的不同社区,赋给其不同的社区热度值
|
||||
* */
|
||||
private static int getCommunityValue(GatherProjectsModel model) {
|
||||
// TODO Auto-generated method stub
|
||||
String source = model.getSource();
|
||||
|
||||
if(source.equals("oschina"))
|
||||
return 4;
|
||||
if(source.equals("openhub"))
|
||||
return 3;
|
||||
if(source.equals("sourceforge"))
|
||||
return 2;
|
||||
if(source.equals("github"))
|
||||
return 1;
|
||||
if(source.equals("freecode"))
|
||||
return 0;
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
ApplicationContext applicationContext = new ClassPathXmlApplicationContext(
|
||||
|
@ -99,207 +281,517 @@ public class GetSynonyms{
|
|||
}
|
||||
|
||||
public Map<String, Integer> getSynonyms(GatherProjectsModel model) {
|
||||
|
||||
if(model.getSource().equals("github"))
|
||||
{
|
||||
return getSynonymsFromGithubPrj(model);//处理github项目
|
||||
}
|
||||
else
|
||||
{
|
||||
return getSynonymsNotGitPrj(model);//处理非github的其他项目
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("finally")
|
||||
public Map<String, Integer> getSynonymsFromGithubPrj(GatherProjectsModel model)
|
||||
{
|
||||
|
||||
Map<String, Integer> SynonymMap = new HashMap<String, Integer>();// 记录每个别名及其准确度,<Sysnonym,1>表时Synonym可靠性很大
|
||||
List<String> synonymsList = new ArrayList<String>();// 最终返回的同义词列表
|
||||
List<String> synonymsList_notsure = new ArrayList<String>();
|
||||
model.setName(model.getName().trim());
|
||||
if(model.getDescription()!=null && !model.getDescription().equals(""))
|
||||
if(model.getDescription() != null)
|
||||
{
|
||||
model.setDescription(model.getDescription().trim());
|
||||
}
|
||||
else
|
||||
{
|
||||
model.setDescription("");
|
||||
}
|
||||
|
||||
//记录每个别名及其准确度,<Sysnonym,1>表时Synonym可靠性很大
|
||||
Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
|
||||
|
||||
//最终返回的可以确定正确的同义词列表
|
||||
List<String> synonymsList = new ArrayList<String>();
|
||||
|
||||
String description = model.getDescription();
|
||||
String prjName = model.getName();
|
||||
|
||||
try
|
||||
{
|
||||
//github项目名称中有斜杠,提取斜杠后内容做为别名
|
||||
String subName = getSubName(prjName);
|
||||
if(subName != null)
|
||||
{
|
||||
SynonymMap.put(subName, 1);
|
||||
}
|
||||
|
||||
|
||||
//将subName中短横线、下划线用空格代替以及将短横线、下划线删除后获取的字符串当做别名
|
||||
String strReplace = ReplaceSpecial(subName);
|
||||
if(!strReplace.equals(subName))
|
||||
{
|
||||
synonymsList.add(strReplace);
|
||||
}
|
||||
|
||||
//删除特殊符号如短横线、下划线,如果其包含在description中则做为别名
|
||||
String strDelete = DeleteSpecial(subName);
|
||||
if(!strDelete.equals(subName))
|
||||
{
|
||||
if(description.toLowerCase().contains(strDelete.toLowerCase()))
|
||||
{
|
||||
synonymsList.add(strDelete);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(description == null)
|
||||
{
|
||||
//如果描述信息为空,别名就只包括已经提取出的和从名字中获取的别名
|
||||
if(synonymsList != null)
|
||||
for(String sure:synonymsList)
|
||||
SynonymMap.put(sure, 1);
|
||||
|
||||
return SynonymMap;
|
||||
}
|
||||
|
||||
//描述信息非空时,尝试从description中提取别名
|
||||
Map<String, Integer> synonymsFromDesc = getSynonymsFromDescOfGitprj(description,model.getName());
|
||||
|
||||
//将别名连同其确定性一起放入SynonymMap中
|
||||
for(String tmpSyn:synonymsList)
|
||||
SynonymMap.put(tmpSyn, 1);
|
||||
|
||||
//将从描述信息中获取的别名及其确定性放入SynonymMap
|
||||
if(synonymsFromDesc != null)
|
||||
for(String key:synonymsFromDesc.keySet())
|
||||
SynonymMap.put(key, synonymsFromDesc.get(key));
|
||||
|
||||
return SynonymMap;
|
||||
}catch(Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
logger.error("Error while extracting synonyms from project:" + model.getId());
|
||||
System.exit(0);
|
||||
}finally
|
||||
{
|
||||
return SynonymMap;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Map<String, Integer> getSynonymsNotGitPrj(GatherProjectsModel model) {
|
||||
model.setName(model.getName().trim());
|
||||
if(model.getDescription() != null)
|
||||
model.setDescription(model.getDescription().trim());
|
||||
|
||||
List<String> synonymsList = new ArrayList<String>();//能够确定正确的别名
|
||||
List<String> synonymsList_notsure = new ArrayList<String>();//不能百分百确定的别名
|
||||
|
||||
String description = model.getDescription();
|
||||
String prjName = model.getName();
|
||||
|
||||
List<String> result = null;
|
||||
if (description == null) {
|
||||
if(description == null)
|
||||
{
|
||||
//如果description为null,则别名无需再做改动直接返回null
|
||||
return null;
|
||||
}
|
||||
|
||||
List<String> linkNameList = new ArrayList<String>();// 用于存储所有提取出来的关联项目名
|
||||
List<String> keywords = FileReader.read("./files/language.txt");
|
||||
|
||||
// 对项目名进行处理,去除公司名或基金组织的信息&&去除括号(没有公司没有括号则包含自己)
|
||||
String projectNameWithoutBracket = StringHandler_ProjectName
|
||||
.removeBracket(prjName);
|
||||
String projectNameWithoutComName = StringHandler_ProjectName
|
||||
.getProjectWithoutComName(projectNameWithoutBracket);
|
||||
// String projectNameWithoutComName =
|
||||
// StringHandler_ProjectName.getProjectWithoutComName(prjName);
|
||||
// projectNameWithoutComName =
|
||||
// StringHandler_ProjectName.removeBracket(projectNameWithoutComName);
|
||||
if (!"".equals(projectNameWithoutComName)) {
|
||||
|
||||
List<String> linkNameList = new ArrayList<String>();//用于存储所有提取出来的可能是别名的词
|
||||
|
||||
List<String> keywords = FileReader.read("./files/language.txt");//用于过滤类似于"Java","C++"这样的词
|
||||
projectNameWithoutBracket = StringHandler_ProjectName.removeBracket(prjName);
|
||||
projectNameWithoutComName = StringHandler_ProjectName.getProjectWithoutComName(projectNameWithoutBracket);
|
||||
//提取项目名括号中的信息,不是项目类型
|
||||
bracket = StringHandler_ProjectName.getBracket(prjName);
|
||||
|
||||
//去除括号及括号内容后即可做为别名
|
||||
if(!"".equals(projectNameWithoutComName))
|
||||
{
|
||||
synonymsList.add(projectNameWithoutComName);
|
||||
// logger.info("gets synonyms:" + projectNameWithoutComName +
|
||||
// " Without ComName and remove racket");
|
||||
logger.info("gets synonyms:" + projectNameWithoutComName + " Without ComName and remove racket");
|
||||
}
|
||||
|
||||
// 提取项目名括号中的信息,不是项目类型
|
||||
String bracket = StringHandler_ProjectName.getBracket(prjName);
|
||||
if (bracket != null && !"".equals(bracket)) {
|
||||
bracket = RegexHandler.extractEngDecChiAndDot(bracket);// 去除括号中信息的特殊符号
|
||||
if (!RegexHandler.onlySpecialChar(bracket)) {
|
||||
if (!keywords.contains(bracket.toLowerCase())) {// 括号中的信息不是语言类关键字词
|
||||
if (StringHandler.isFullOrShort(bracket,
|
||||
projectNameWithoutBracket)) {
|
||||
synonymsList.add(bracket);
|
||||
} else {
|
||||
if (StringHandler.specialIsFullOrShort(bracket,
|
||||
projectNameWithoutBracket))
|
||||
|
||||
|
||||
try
|
||||
{
|
||||
//TODO 将对项目名称的处理过程独立为一个函数
|
||||
getSynonymsFromName(synonymsList,linkNameList,keywords);
|
||||
|
||||
if(description != null)
|
||||
getSynonymsFromDescOfNotGitprj(model,description,prjName,synonymsList,linkNameList,keywords);
|
||||
|
||||
//result包含去掉公司信息、括号内的、be动词之前的
|
||||
result = removeDupStr(linkNameList);
|
||||
|
||||
//TODO 将从linkName中提取全/缩写独立出来
|
||||
getSynonymsByFullOrShort(result,synonymsList);
|
||||
|
||||
//TODO 将从linkName中依据相似度提取别名的程序独立出来
|
||||
getSynonymsBySimilarDegree(model.getName(),result,synonymsList_notsure,keywords,"notGitPrj");
|
||||
}catch(Exception e)
|
||||
{
|
||||
logger.error("Error while extracting synonym from prj:" + model.getId() + ":" +e);
|
||||
}
|
||||
|
||||
return afterExtract(synonymsList, synonymsList_notsure);
|
||||
}
|
||||
|
||||
public void getSynonymsFromName(List<String> synonymsList,List<String> linkNameList,List<String> keywords)
|
||||
{
|
||||
if(bracket != null && projectNameWithoutBracket != null && projectNameWithoutComName != null)
|
||||
{
|
||||
// Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
|
||||
if(!"".equals(bracket))
|
||||
{
|
||||
bracket = RegexHandler.extractEngDecChiAndDot(bracket);//去除括号中信息的特殊符号
|
||||
if(!RegexHandler.onlySpecialChar(bracket))
|
||||
{
|
||||
if(!keywords.contains(bracket.toLowerCase()))//括号中的信息不是语言类关键字词
|
||||
{
|
||||
//如果括号中内容个项目名存在全/缩写关系
|
||||
if(StringHandler.isFullOrShort(bracket, projectNameWithoutBracket))
|
||||
{
|
||||
synonymsList.add(bracket);
|
||||
//SynonymMap.put(bracket, 1);
|
||||
}
|
||||
else
|
||||
linkNameList.add(bracket);
|
||||
{
|
||||
if(StringHandler.specialIsFullOrShort(bracket, projectNameWithoutBracket))
|
||||
synonymsList.add(bracket);
|
||||
else linkNameList.add(bracket);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (description != null) {
|
||||
|
||||
List<String> beWords = FileReader.read("./files/beVerb.txt");// 读取be动词表
|
||||
description = StringHandler.getFirstSentence(description);
|
||||
|
||||
for (String beWord : beWords) {
|
||||
|
||||
String linkName = StringHandler.findLinkName(description,
|
||||
beWord);// 按系动词表优先级进行匹配
|
||||
if (linkName == null) {// 表示没有匹配到当前的系动词 进行下一个匹配
|
||||
continue;
|
||||
}
|
||||
if (linkName.toLowerCase().equals(prjName.toLowerCase()))
|
||||
continue;
|
||||
// 特征短语去噪
|
||||
List<String> linkNameResult = EDDHandler
|
||||
.getMainAndViceDescriptionByBracket(linkName);// 提取主副描述信息(括号中)
|
||||
linkNameResult = EDDHandler
|
||||
.getMainAndViceDescriptionByKnowAs(linkNameResult);// 提取主副描述信息(known
|
||||
// as+连词)
|
||||
linkNameResult = EDDHandler.removeComma(linkNameResult);// 去除定语描述信息(逗号定语)
|
||||
linkNameResult = EDDHandler
|
||||
.removeDemonstrativeWords(linkNameResult);// 去除定语描述信息(定冠词The)
|
||||
linkNameResult = EDDHandler
|
||||
.removeArrtibutiveClause(linkNameResult);// 去除从句描述信息
|
||||
linkNameResult = EDDHandler.removePronoun(linkNameResult);// 去除指示代词
|
||||
linkNameResult = EDDHandler.removeProject(linkNameResult);// 去除项目常用词
|
||||
linkNameResult = EDDHandler.removePPCombine(linkNameResult);// 去除指示代词和项目常用词两者的笛卡尔积
|
||||
linkNameResult = RegexHandler
|
||||
.extractEngDecChiAndDot(linkNameResult);// 提取项目中
|
||||
linkNameResult = RegexHandler.removeOnlySpecial(linkNameResult);// 删除只有特殊字符的同义词
|
||||
for (String name : linkNameResult) {
|
||||
// logger.info("be 动词之前的有: "+name);
|
||||
// be动词前不包括项目类型,且和项目名不相同
|
||||
if (!keywords.contains(name.toLowerCase().trim())
|
||||
&& !name.toLowerCase().trim()
|
||||
.equals(prjName.toLowerCase().trim())
|
||||
&& !name.toLowerCase()
|
||||
.trim()
|
||||
.equals(projectNameWithoutComName
|
||||
.toLowerCase().trim())) {
|
||||
// 这里的prjName是完整的name
|
||||
if (prjName.toLowerCase().indexOf(name.toLowerCase()) != -1) {
|
||||
String withoutShortDash = RemoveShortDash(name);
|
||||
if (bracket == null || bracket.equals(""))
|
||||
continue;
|
||||
if (name.toLowerCase()
|
||||
.equals(bracket.toLowerCase())
|
||||
|| name.toLowerCase().equals(
|
||||
withoutShortDash))
|
||||
synonymsList.add(name);
|
||||
else {
|
||||
int nameLength1 = withoutShortDash.split(" ").length;
|
||||
int nameLength2 = projectNameWithoutBracket
|
||||
.split(" ").length;
|
||||
int nameLength = name.trim().split(" ").length;
|
||||
double ratio1 = (nameLength1 / nameLength) * 1.0;
|
||||
double ratio2 = (nameLength2 / nameLength) * 1.0;
|
||||
if (nameLength1 > 1 && ratio1 > 0.6)
|
||||
synonymsList.add(name);
|
||||
else {
|
||||
if (nameLength2 > 1 && ratio2 > 0.6)
|
||||
synonymsList.add(name);
|
||||
else
|
||||
linkNameList.add(name);
|
||||
}
|
||||
}
|
||||
|
||||
// synonymsList.add(name);//问题出在这个地方,如果从discription中提取出的词在名字中有出现就直接加入,这
|
||||
// 会导致比如Ajax被识别为Ajax Implement的别名,而实际上并不是这样的。
|
||||
// logger.info("gets synonyms:" + name +
|
||||
// " from description before be verb");
|
||||
} else
|
||||
linkNameList.add(name);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
// result包含去掉公司信息、括号内的、be动词之前的
|
||||
result = removeDupStr(linkNameList);
|
||||
// logger.info("result去掉公司信息、括号内的、be动词之前的;");
|
||||
// for (int i = 0; i < result.size(); i++)
|
||||
// System.out.println("result: " + result.get(i));
|
||||
|
||||
// 去除重复的
|
||||
|
||||
String fullName = StringHandler.getFullName(
|
||||
projectNameWithoutComName.toLowerCase(), result);
|
||||
if (!fullName.equals("")) {
|
||||
synonymsList.add(fullName.toLowerCase());// 表示项目名确实是缩写 并提取到了项目全名
|
||||
// logger.info("项目的全称为: "+fullName);
|
||||
}
|
||||
|
||||
String shortName = StringHandler.getShortName(
|
||||
projectNameWithoutComName.toLowerCase(), result);
|
||||
if (!shortName.equals("")) {
|
||||
synonymsList.add(shortName.toLowerCase());// 表示项目名确实是全称 并提取到了项目缩写
|
||||
// logger.info("项目的简称为: "+shortName);
|
||||
}
|
||||
|
||||
// 同义词要和项目名称有一定的相似度
|
||||
for (int i = 0; i < result.size(); i++) {
|
||||
}
|
||||
|
||||
public void getSynonymsBySimilarDegree(String Name,List<String> result,List<String> synonymsList_notsure,List<String> keywords,String prjSource)
|
||||
{
|
||||
//同义词要和项目名称有一定的相似度
|
||||
for(int i = 0; i < result.size(); i++){
|
||||
String extract = "";
|
||||
String synonym = result.get(i);
|
||||
String[] words = synonym.split(" ");// 按照空格进行分词
|
||||
for (String word : words) {
|
||||
if (model.getName().toLowerCase().indexOf(word.toLowerCase()) >= 0) {// 查看每个词在项目名中是否出现
|
||||
String[] words = synonym.split(" ");//按照空格进行分词
|
||||
for(String word:words){
|
||||
if(Name.toLowerCase().indexOf(word.toLowerCase()) >= 0){//查看每个词在项目名中是否出现
|
||||
extract += word + " ";
|
||||
}
|
||||
}
|
||||
if (extract.length() != 0)
|
||||
if(extract.length() !=0)
|
||||
extract = extract.substring(0, extract.length() - 1);
|
||||
// logger.info("extract : "+extract);
|
||||
if (!"".equals(extract) && !keywords.contains(extract)) {
|
||||
// 提取的别名不和项目名重复
|
||||
if (!extract.toLowerCase().equals(prjName.toLowerCase().trim())
|
||||
&& !extract
|
||||
.toLowerCase()
|
||||
.trim()
|
||||
.equals(projectNameWithoutComName.toLowerCase()
|
||||
.trim())) {
|
||||
double similarDegree = SimilarDegree(extract,
|
||||
projectNameWithoutComName);
|
||||
// System.out.println("字符串间的相似度为: "+similarDegree);
|
||||
|
||||
if (similarDegree >= 0.6 && similarDegree < 1.0) {
|
||||
// logger.info("from extract after similar: "+extract);
|
||||
//logger.info("extract : "+extract);
|
||||
if(!"".equals(extract) && !keywords.contains(extract))
|
||||
{
|
||||
boolean AbleToCalcSimilar = false;
|
||||
if(prjSource.equals("github"))
|
||||
{
|
||||
String strReplace = ReplaceSpecial(getSubName(Name));
|
||||
if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(strReplace.toLowerCase().trim()))
|
||||
AbleToCalcSimilar = true;
|
||||
}
|
||||
if(!prjSource.equals("notGitPrj"))
|
||||
{
|
||||
if(Name!= null && projectNameWithoutBracket != null)
|
||||
if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim()))
|
||||
AbleToCalcSimilar = true;
|
||||
}
|
||||
//提取的别名不和项目名重复
|
||||
if(AbleToCalcSimilar)
|
||||
{
|
||||
double similarDegree = 0.0;
|
||||
if(!prjSource.equals("github")) similarDegree= SimilarDegree(extract,projectNameWithoutComName);
|
||||
if(prjSource.equals("github")) similarDegree = SimilarDegree(extract, Name);
|
||||
if(similarDegree >= 0.6 && similarDegree <1.0)
|
||||
{
|
||||
logger.info("from extract after similar: "+extract);
|
||||
synonymsList_notsure.add(extract);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
removeDupStr(synonymsList);// 去除同名列表中重复的
|
||||
removeDupStr(synonymsList_notsure);
|
||||
for (String s : synonymsList) {
|
||||
// logger.info("finally synonyms:" + s + " by description");
|
||||
}
|
||||
|
||||
public void getSynonymsByFullOrShort(List<String> result,List<String>synonymsList)
|
||||
{
|
||||
String fullName = StringHandler.getFullName(projectNameWithoutComName.toLowerCase(), result);
|
||||
if(!fullName.equals("")){
|
||||
synonymsList.add(fullName.toLowerCase());//表示项目名确实是缩写 并提取到了项目全名
|
||||
}
|
||||
|
||||
String shortName = StringHandler.getShortName(projectNameWithoutComName.toLowerCase(), result);
|
||||
if(!shortName.equals("")){
|
||||
synonymsList.add(shortName.toLowerCase());//表示项目名确实是全称 并提取到了项目缩写
|
||||
}
|
||||
}
|
||||
|
||||
public void getSynonymsFromDescOfNotGitprj(GatherProjectsModel model,String description,String prjName,List<String> synonymsList,List<String> linkNameList,List<String> keywords)
|
||||
{
|
||||
|
||||
for (String sure : synonymsList)
|
||||
SynonymMap.put(sure, 1);
|
||||
for (String Notsure : synonymsList_notsure)
|
||||
SynonymMap.put(Notsure, 0);
|
||||
return SynonymMap;
|
||||
|
||||
List<String> beWords = FileReader.read("./files/beVerb.txt");//读取be动词表
|
||||
//TODO 将对description的分词处理部分独立出来
|
||||
description = StringHandler.getFirstSentence(description);
|
||||
for(String beWord:beWords){
|
||||
|
||||
String linkName = StringHandler.findLinkName(description, beWord);//按系动词表优先级进行匹配
|
||||
if(linkName == null){//表示没有匹配到当前的系动词 进行下一个匹配
|
||||
continue;
|
||||
}
|
||||
if(linkName.toLowerCase().equals(prjName.toLowerCase()))
|
||||
continue;
|
||||
//特征短语去噪
|
||||
List<String> linkNameResult =getLinkNameResult(linkName);
|
||||
|
||||
handleLinkNameResult(model,linkNameResult,synonymsList,linkNameList,keywords);
|
||||
}
|
||||
|
||||
}
|
||||
public Map<String, Integer> getSynonymsFromDescOfGitprj(String description,String prjName)
|
||||
{
|
||||
List<String> keywords = FileReader.read("./files/language.txt");
|
||||
List<String> beWords = FileReader.read("./files/beVerb.txt");
|
||||
//程序中只使用description的第一句进行处理。
|
||||
String descriptionOfFirstSentence = StringHandler.getFirstSentence(description);
|
||||
//获取斜杠后名字
|
||||
String subName = getSubName(prjName);
|
||||
|
||||
//将subName中的特殊符号用空格代替
|
||||
String strReplace = ReplaceSpecial(subName);
|
||||
//将subName中的特殊符号直接删除
|
||||
String strDelete = DeleteSpecial(subName);
|
||||
List<String> linkNameList = new ArrayList<String>();
|
||||
List<String> synonymsList = new ArrayList<String>();//确定正确的别名
|
||||
List<String> synonymsList_not_sure = new ArrayList<String>();//不确定是否正确但是满足提取规则的别名
|
||||
|
||||
// int matchBewords = 0;//指示description中是否有be动词,为0则没有匹配到任何be动词,大于1则说明有匹配到be动词
|
||||
List<String> linkNameResult = new ArrayList<String>();//存放description分词后的处理结果
|
||||
int length = beWords.size();
|
||||
for(int k=0;k<length;k++)
|
||||
{
|
||||
//按系动词表优先级进行匹配
|
||||
String linkName = StringHandler.findLinkName(descriptionOfFirstSentence, beWords.get(k));
|
||||
if(linkName == null)
|
||||
{
|
||||
if(k < length-1)
|
||||
continue;
|
||||
else
|
||||
{
|
||||
if(description.length()<100)
|
||||
linkName = description;
|
||||
else linkName = description.substring(0, 100);
|
||||
}
|
||||
}
|
||||
|
||||
//如果提取出的和subName相等就不再进行接下来的判断,这在github中普遍存在
|
||||
if(linkName.toLowerCase().equals(subName.toLowerCase()))
|
||||
continue;
|
||||
|
||||
//对匹配到be动词的部分进行分词处理
|
||||
linkNameResult = getLinkNameResult(linkName);
|
||||
|
||||
if(linkNameResult != null)
|
||||
{
|
||||
//将从description中提取出的各个单词组合成长字符串
|
||||
String strFromDesc="";
|
||||
|
||||
for(String name:linkNameResult)
|
||||
{
|
||||
logger.info("be 动词之前的有: "+name);
|
||||
if(!keywords.contains(name.toLowerCase().trim()) && !name.toLowerCase().trim().equals(subName.toLowerCase().trim()) )
|
||||
{
|
||||
if(DeleteSpecial(name).equals(strDelete) && DeleteSpecial(name)!= null)
|
||||
{
|
||||
synonymsList.add(name);
|
||||
}
|
||||
else
|
||||
{
|
||||
//提取全、缩写形式的别名
|
||||
String synonymsFromGitPrj = getSynonymByFullOrShortOfGitPrj(strReplace.toLowerCase(),strDelete.toLowerCase(),name.toLowerCase());
|
||||
|
||||
if(synonymsFromGitPrj != null)
|
||||
{
|
||||
synonymsList.add(synonymsFromGitPrj);
|
||||
}
|
||||
else
|
||||
{
|
||||
linkNameList.add(name.toLowerCase());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//将经过处理的description再组装成字符串
|
||||
strFromDesc = strFromDesc + name + " ";
|
||||
}
|
||||
|
||||
strFromDesc = strFromDesc.trim();
|
||||
}
|
||||
}
|
||||
|
||||
List<String>result = removeDupStr(linkNameList);
|
||||
getSynonymsBySimilarDegree(subName, result, synonymsList_not_sure, keywords,"github");
|
||||
return afterExtract(synonymsList,synonymsList_not_sure);
|
||||
}
|
||||
|
||||
public List<String> getLinkNameResult(String linkName)
|
||||
{
|
||||
//特征短语去噪
|
||||
List<String> linkNameResult = EDDHandler.getMainAndViceDescriptionByBracket(linkName);//提取主副描述信息(括号中)
|
||||
linkNameResult = EDDHandler.getMainAndViceDescriptionByKnowAs(linkNameResult);//提取主副描述信息(known as+连词)
|
||||
linkNameResult = EDDHandler.removeComma(linkNameResult);//去除定语描述信息(逗号定语)
|
||||
linkNameResult = EDDHandler.removeDemonstrativeWords(linkNameResult);//去除定语描述信息(定冠词The)
|
||||
linkNameResult = EDDHandler.removeArrtibutiveClause(linkNameResult);//去除从句描述信息
|
||||
linkNameResult = EDDHandler.removePronoun(linkNameResult);//去除指示代词
|
||||
linkNameResult = EDDHandler.removeProject(linkNameResult);//去除项目常用词
|
||||
linkNameResult = EDDHandler.removePPCombine(linkNameResult);//去除指示代词和项目常用词两者的笛卡尔积
|
||||
linkNameResult = RegexHandler.extractEngDecChiAndDot(linkNameResult);//提取项目中
|
||||
linkNameResult = RegexHandler.removeOnlySpecial(linkNameResult);//删除只有特殊字符的同义词
|
||||
|
||||
return linkNameResult;
|
||||
}
|
||||
|
||||
public void handleLinkNameResult(GatherProjectsModel model,List<String> linkNameResult,List<String> synonymsList,List<String>linkNameList,List<String>keywords)
|
||||
{
|
||||
for(String name:linkNameResult){
|
||||
logger.info("be 动词之前的有: "+name);
|
||||
//be动词前不包括项目类型,且和项目名不相同
|
||||
if(!keywords.contains(name.toLowerCase().trim()) && !name.toLowerCase().trim().equals(model.getName().toLowerCase().trim())
|
||||
&& !name.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim())){
|
||||
//这里的prjName是完整的name
|
||||
if(model.getName().toLowerCase().indexOf(name.toLowerCase()) != -1){
|
||||
String withoutShortDash = RemoveShortDash(name);
|
||||
if(name.toLowerCase().equals(bracket.toLowerCase()) || name.toLowerCase().equals(withoutShortDash))
|
||||
synonymsList.add(name);
|
||||
else
|
||||
{
|
||||
int nameLength1 = withoutShortDash.split(" ").length;
|
||||
int nameLength2 = projectNameWithoutBracket.split(" ").length;
|
||||
int nameLength = name.trim().split(" ").length;
|
||||
double ratio1= (nameLength1/nameLength) * 1.0;
|
||||
double ratio2 =(nameLength2/nameLength) * 1.0;
|
||||
if(nameLength1 > 1 && ratio1 > 0.6)
|
||||
synonymsList.add(name);
|
||||
else
|
||||
{
|
||||
if(nameLength2 > 1 && ratio2 > 0.6)
|
||||
synonymsList.add(name);
|
||||
else linkNameList.add(name);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
linkNameList.add(name);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//取斜杠后面的字符串,如果没有斜杠就返回字符串本身
|
||||
private String getSubName(String prjName) {
|
||||
// TODO Auto-generated method stub
|
||||
String SubName = prjName;
|
||||
if(prjName.indexOf('/') == -1)
|
||||
return SubName;
|
||||
else
|
||||
SubName = prjName.substring(prjName.indexOf('/'));
|
||||
return SubName;
|
||||
}
|
||||
|
||||
public static String DeleteSpecial(String subName) {
|
||||
// TODO Auto-generated method stub
|
||||
int deleteNum=0;
|
||||
StringBuffer str=new StringBuffer(subName);
|
||||
for(int i=0;i< subName.length();i++)
|
||||
{
|
||||
//去除下划线
|
||||
if(subName.charAt(i) == '_')
|
||||
{
|
||||
str.deleteCharAt(i-deleteNum);
|
||||
deleteNum++;
|
||||
|
||||
}
|
||||
//去除短横线
|
||||
if(subName.charAt(i) == '-')
|
||||
{
|
||||
//System.out.println("字符位置在" + i + " and 字符是" + subName.charAt(i));
|
||||
str.deleteCharAt(i-deleteNum);
|
||||
deleteNum++;
|
||||
}
|
||||
if(subName.charAt(i) == '.')
|
||||
{
|
||||
//System.out.println("字符位置在" + i + " and 字符是" + subName.charAt(i));
|
||||
str.deleteCharAt(i-deleteNum);
|
||||
deleteNum++;
|
||||
}
|
||||
if(subName.charAt(i)== '/')
|
||||
{
|
||||
//System.out.println("字符位置在" + i + " and 字符是" + subName.charAt(i));
|
||||
str.deleteCharAt(i-deleteNum);
|
||||
deleteNum++;
|
||||
}
|
||||
}
|
||||
|
||||
return str.toString();
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static String ReplaceSpecial(String subName) {
|
||||
// TODO Auto-generated method stub
|
||||
StringBuffer str=new StringBuffer(subName);
|
||||
for(int i=0;i<subName.length();i++)
|
||||
{
|
||||
if(subName.charAt(i)== '-')
|
||||
str.setCharAt(i, ' ');
|
||||
if(subName.charAt(i) == '_')
|
||||
str.setCharAt(i, ' ');
|
||||
}
|
||||
return str.toString();
|
||||
}
|
||||
|
||||
//将提取出的一定正确的别名及不确定是否正确的别名和其精确度对应起来,组装到Map中
|
||||
public Map<String,Integer>afterExtract(List<String> synonymsList,List<String>synonymsList_not_sure)
|
||||
{
|
||||
Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
|
||||
if(synonymsList != null)
|
||||
{
|
||||
synonymsList = removeDupStr(synonymsList);
|
||||
if(synonymsList != null)
|
||||
for(String sure:synonymsList)
|
||||
{
|
||||
SynonymMap.put(sure, 1);
|
||||
}
|
||||
}
|
||||
|
||||
if(synonymsList_not_sure != null)
|
||||
{
|
||||
synonymsList_not_sure = removeDupStr(synonymsList_not_sure);
|
||||
for(String Notsure:synonymsList_not_sure)
|
||||
{
|
||||
SynonymMap.put(Notsure, 0);
|
||||
}
|
||||
}
|
||||
|
||||
return SynonymMap;
|
||||
}
|
||||
|
||||
public String getSynonymByFullOrShortOfGitPrj(String subName,String subName2,String descWithoutBracketHandled)
|
||||
{
|
||||
if(StringHandler.isFullOrShort(ReplaceSpecial(descWithoutBracketHandled).toLowerCase(), subName.toLowerCase()) ||
|
||||
StringHandler.specialIsFullOrShortForGitPrj(ReplaceSpecial(descWithoutBracketHandled).toLowerCase(), subName.toLowerCase()))
|
||||
{
|
||||
return descWithoutBracketHandled;
|
||||
//System.out.println("ReplaceSpecial提取到别名:" + descWithoutBracketHandled);
|
||||
}
|
||||
// if(StringHandler.isFullOrShort(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()) ||
|
||||
// StringHandler.specialIsFullOrShortForGitPrj(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()))
|
||||
// {
|
||||
// return descWithoutBracketHandled;
|
||||
// //System.out.println("DeleteSpecial提取到别名:" + descWithoutBracketHandled);
|
||||
// }
|
||||
return null;
|
||||
}
|
||||
public static String RemoveShortDash(String name) {
|
||||
String withoutShortDash = "";
|
||||
if (name.indexOf('-') > -1)
|
||||
|
|
|
@ -6,6 +6,7 @@ import org.apache.ibatis.annotations.Delete;
|
|||
import org.apache.ibatis.annotations.Insert;
|
||||
import org.apache.ibatis.annotations.Param;
|
||||
import org.apache.ibatis.annotations.Select;
|
||||
import org.apache.ibatis.annotations.Update;
|
||||
|
||||
import com.ossean.model.OpenSourceProject;
|
||||
import com.ossean.model.Taggings;
|
||||
|
@ -13,6 +14,12 @@ import com.ossean.model.Taggings;
|
|||
|
||||
|
||||
public interface DBDest {
|
||||
//从别名表中找出和synonym相同的别名对应的ID
|
||||
@Select("select distinct(prjId) from ${targetTable} where synonyms=#{Synonym} and flag = #{flag}")
|
||||
public Set<Integer> selectSameSynonymPrj(@Param("targetTable")String targetTable, @Param("Synonym")String Synonym,@Param("flag") int flag);
|
||||
//根据别名和项目Id更新别名表中的flag
|
||||
@Update("update ${targetTable} set flag = #{flag} where synonyms = #{synonym} and prjId != #{prjId}")
|
||||
public void updateSynoymmings(@Param("targetTable") String targetTable,@Param("synonym") String synonym,@Param("prjId") int prjId,@Param("flag")int flag);
|
||||
|
||||
//删除open_source_projects表中对应id的数据
|
||||
@Delete("delete from ${table} where id=#{id}")
|
||||
|
|
|
@ -19,6 +19,10 @@ import com.ossean.model.Tags;
|
|||
import com.ossean.model.UpdateControlProjects;
|
||||
|
||||
public interface DBSource {
|
||||
|
||||
//读取gather_projects表中的数据 根据id in来读取
|
||||
@Select("select * from ${table} where id=#{id}")
|
||||
public GatherProjectsModel getGatherProjectById(@Param("table") String table, @Param("id") int id);
|
||||
//查看指针edd_pointers表,查看处理数量
|
||||
@Select("select Pointer from ${table} where SourceTableName=#{SourceTableName} and TargetTableName=#{TargetTableName}")
|
||||
public int getPointer(@Param("table") String table, @Param("SourceTableName") String SourceTableName, @Param("TargetTableName") String TargetTableName);
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package com.ossean.dao2;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.ibatis.annotations.Param;
|
||||
import org.apache.ibatis.annotations.Select;
|
||||
|
||||
|
@ -22,4 +24,10 @@ public interface PlatformProjectDao {
|
|||
|
||||
@Select("select id,short_name,name,stargazers_count,forks from github where url = #{url} group by url_md5 order by extracted_time desc")
|
||||
public GithubProject getGithubPrjByUrl(@Param("url") String url);
|
||||
//获取创建时间以比较项目的可信度
|
||||
@Select("select created_time from ${table} where url_md5=#{url_md5}")
|
||||
public List<String> getTime(@Param("table") String table, @Param("created_time") String created_time,@Param("url_md5") String url_md5);
|
||||
//获取star_num,download_num等判断各项目热度
|
||||
@Select("select ${star_num} from ${table} where url_md5=#{url_md5}")
|
||||
public List<Integer> getHotValue(@Param("table") String table, @Param("star_num") String star_num,@Param("url_md5") String url_md5);
|
||||
}
|
||||
|
|
|
@ -50,7 +50,6 @@ public class StringHandler {
|
|||
}
|
||||
}
|
||||
resultStrArr = tmp.toArray(new String[0]);
|
||||
//System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]);
|
||||
return resultStrArr;
|
||||
}
|
||||
|
||||
|
@ -289,13 +288,18 @@ public class StringHandler {
|
|||
}
|
||||
|
||||
public static boolean getFullName(String shortName, String synonym,int a){
|
||||
|
||||
//查看当前的同义词是不是含不含有全称
|
||||
String[] words = synonym.toLowerCase().split(" ");
|
||||
String shortenWord = "";
|
||||
for(int i = 0; i < words.length; i++){
|
||||
String word = words[i];
|
||||
String tmp = word.substring(0, 1);//取出第一个英文字符
|
||||
String tmp = "";
|
||||
if(word.length() == 1)
|
||||
tmp = word;
|
||||
else
|
||||
if(word.length() > 0)
|
||||
tmp = word.substring(0, 1);//取出第一个英文字符
|
||||
|
||||
shortenWord += tmp;
|
||||
}
|
||||
|
||||
|
@ -358,26 +362,22 @@ public class StringHandler {
|
|||
|
||||
public static boolean specialIsFullOrShort(String shortName, String fullName)
|
||||
{
|
||||
//System.out.println("进入了特殊别名提取函数");
|
||||
//首先获取两个字符串中单词的
|
||||
String[] words_fullName= fullName.toLowerCase().split(" ");
|
||||
String[] words_shortName = shortName.toLowerCase().split(" ");
|
||||
|
||||
if(words_fullName.length==1 && words_shortName.length==1)
|
||||
{
|
||||
//System.out.println("两者长度都大于1");
|
||||
return false;
|
||||
}
|
||||
|
||||
if(words_shortName.length == 1 && words_fullName.length > 1 )
|
||||
{
|
||||
//System.out.println("fullName的长度大于1");
|
||||
if(specialIsFullOrShorDetail(shortName,fullName))
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
//System.out.println("shortName长度大于1");
|
||||
if(specialIsFullOrShorDetail(fullName,shortName))
|
||||
return true;
|
||||
}
|
||||
|
@ -397,10 +397,15 @@ public class StringHandler {
|
|||
//要求全写中各个单词的首字母必须在缩写中出现而且是顺序出现或最多有一个不出现在缩写中。
|
||||
for(int i = 0; i < words_fullName.length; i++){
|
||||
String word = words_fullName[i].toLowerCase();
|
||||
String tmp = word.substring(0, 1);//取出第一个英文字符
|
||||
String tmp = "";//取出第一个英文字符
|
||||
if(word.length() == 1)
|
||||
tmp = word;
|
||||
else
|
||||
if(word.length() > 0)
|
||||
tmp = word.substring(0, 1);//取出第一个英文字符
|
||||
if(shortName.toLowerCase().contains(tmp))
|
||||
{
|
||||
//System.out.println("进入了详细字符串比对函数");
|
||||
|
||||
// tmpFlag = shortName.indexOf(tmp);
|
||||
// if(tmpFlag > positionFlag)
|
||||
// {
|
||||
|
@ -412,11 +417,10 @@ public class StringHandler {
|
|||
|
||||
/**
|
||||
如果满足全写中各个单词的首字母都在缩写中顺序出现就接着
|
||||
判断缩写中所有字幕是否都在写中出现
|
||||
判断缩写中所有字幕是否都在全写中出现
|
||||
*/
|
||||
if(matchCount > words_fullName.length-2)
|
||||
{
|
||||
//System.out.println("匹配个数与全写中单词个数相等");
|
||||
if(isAllContained(shortName,fullName))
|
||||
return true;
|
||||
}
|
||||
|
@ -540,4 +544,50 @@ public class StringHandler {
|
|||
return shortName;//表示描述信息中出现了缩写形式
|
||||
return "";
|
||||
}
|
||||
|
||||
public static boolean specialIsFullOrShortForGitPrj(String shortName, String fullName)
|
||||
{
|
||||
String[] splitFullName = null;
|
||||
int containedNum = 0;//记录shortName中包含在fullName中的单词的个数
|
||||
|
||||
if(fullName.indexOf("_") > -1)
|
||||
splitFullName = fullName.split("-");
|
||||
if(fullName.indexOf("-") > -1)
|
||||
splitFullName = fullName.split("_");
|
||||
else
|
||||
splitFullName = fullName.split(" ");
|
||||
|
||||
if(splitFullName != null)
|
||||
for(String tmp:splitFullName)
|
||||
if(shortName.contains(tmp))
|
||||
containedNum++;
|
||||
if(containedNum > 2)
|
||||
return false;
|
||||
|
||||
if(fullName.contains(shortName) || shortName.contains(fullName))
|
||||
return false;
|
||||
//首先获取两个字符串中单词的
|
||||
String[] words_fullName= fullName.toLowerCase().split(" ");
|
||||
String[] words_shortName = shortName.toLowerCase().split(" ");
|
||||
|
||||
if(words_fullName.length==1 && words_shortName.length==1)
|
||||
return false;
|
||||
|
||||
if(words_shortName.length == 1 && words_fullName.length > 1 )
|
||||
{
|
||||
if(specialIsFullOrShorDetail(shortName,fullName))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(specialIsFullOrShorDetail(fullName,shortName))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue