git filter
This commit is contained in:
ronger 2017-03-06 17:42:43 +08:00
commit d7855fa22e
31 changed files with 1940 additions and 6886 deletions

View File

@ -6,6 +6,6 @@
<entry key="ext_dict">ext.dic;</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">stopword_1.dic;code_keywords.dic;stopword.dic</entry>
<entry key="ext_stopwords"></entry>
</properties>

View File

@ -11,6 +11,11 @@
http://www.springframework.org/schema/tx/spring-tx.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-3.0.xsd">
<bean id="jdbcTemplate" class="org.springframework.jdbc.core.JdbcTemplate">
<property name="dataSource" ref="dataSource"></property>
</bean>
<bean id="SpringApplicationContext" class="com.ossean.match.utils.ApplicationContextHelper"></bean>
<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
<property name="dataSource" ref="dataSource" />

View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>TableFlow</comment>
<entry key="startId">0</entry>
<entry key="endId">200000</entry>
<entry key="batchSize">5000</entry>
<entry key="newComing">true</entry>
</properties>

View File

@ -50,8 +50,12 @@ public interface ProjectDao {
@Param("endId") int endId);
// 批量获取项目filtration为1表示保留为2表示之前保留的且已处理为0标识不保留update_mark为null表示已处理或者无更新不为null表示有更新
@Select("select id,name,url,tags,synonyms from open_source_projects where id>#{startId} AND id<=#{endId} AND ((filtration = 1) or (update_mark is not null and filtration = 2)) limit #{batchSize}")
public List<Project> getBatchPrjsIncre(@Param("startId") int startId,
@Param("endId") int endId,@Param("batchSize") int batchSize);
@Select("select id,name,url,tags,synonyms from open_source_projects where (filtration = 1) or (update_mark is not null and filtration = 2) limit #{batchSize}")
public List<Project> getBatchPrjsIncre(@Param("batchSize") int batchSize);
public List<Project> getBatchPrjsIncreWithoutLimit(@Param("batchSize") int batchSize);
// 将已经匹配过的项目的update_mark和filtration设为null和2表示已匹配
@Update("update open_source_projects set update_mark = null,filtration = 2 where id = #{prjId}")

View File

@ -3,6 +3,7 @@ package com.ossean.match.dao;
import java.util.List;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.InsertProvider;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
@ -17,5 +18,9 @@ public interface RelativeMemoToOpenSourceProjectDao {
//写入匹配结果
@Insert("insert into ${targetTableName} (osp_id,relative_memo_id,match_weight,match_time) values (#{item.osp_id},#{item.relative_memo_id},#{item.match_weight},now())")
public void insertMatchResult(@Param("targetTableName") String targetTableName, @Param("item") RelativeMemoToOpenSourceProject item);
//写入匹配结果
@InsertProvider(type = RelativeMemoToOpenSourceProjectDaoProvider.class, method = "insertAll")
public void insertMatchResultBatch(@Param("targetTableName") String targetTableName, @Param("list") List<RelativeMemoToOpenSourceProject> list);
}

View File

@ -0,0 +1,27 @@
package com.ossean.match.dao;
import java.text.MessageFormat;
import java.util.List;
import java.util.Map;
import com.ossean.match.model.RelativeMemoToOpenSourceProject;
/**
 * MyBatis SQL provider that builds one multi-row INSERT statement for a batch
 * of match results.
 */
public class RelativeMemoToOpenSourceProjectDaoProvider {

    /**
     * Builds {@code INSERT INTO <table> (...) VALUES (...),(...)} with one value
     * tuple per element of the {@code list} parameter. Each tuple refers to its
     * element through {@code #{list[i].property}} placeholders.
     *
     * <p>The index is appended as a plain decimal number. Formatting it through
     * {@code MessageFormat} would apply locale number formatting and turn index
     * 1000 into {@code 1,000}, producing an invalid placeholder.
     *
     * @param map parameter map; reads {@code "list"} (the rows to insert) and
     *            {@code "targetTableName"} (the table to insert into)
     * @return the INSERT statement text
     * @throws IllegalArgumentException if the table name is missing or the list
     *                                  is missing or empty, since no valid
     *                                  statement can be built in those cases
     */
    public String insertAll(Map<String, ?> map) {
        // Only the element count is needed here, so no element type is required.
        List<?> list = (List<?>) map.get("list");
        String tableName = (String) map.get("targetTableName");
        if (tableName == null || tableName.length() == 0) {
            throw new IllegalArgumentException("targetTableName must not be empty");
        }
        if (list == null || list.isEmpty()) {
            throw new IllegalArgumentException("list must contain at least one row to insert");
        }

        StringBuilder sb = new StringBuilder();
        sb.append("INSERT INTO ").append(tableName).append(' ');
        sb.append("(osp_id,relative_memo_id,match_weight,match_time) ");
        sb.append("VALUES ");
        for (int i = 0; i < list.size(); i++) {
            if (i > 0) {
                sb.append(',');
            }
            String item = "#{list[" + i + "].";
            sb.append('(')
                .append(item).append("osp_id},")
                .append(item).append("relative_memo_id},")
                .append(item).append("match_weight},")
                .append("now())");
        }
        return sb.toString();
    }
}

View File

@ -55,11 +55,9 @@ public class Main {
newPrjMonitor.run();
prjToMemoMatch.run();
memoToPrjMatch.run();
if (matchIncrement.getSleepTime() > 0) {
logger.info(".........sleeping.........." + matchIncrement.getSleepTime()/1000 + "s......");
}
Thread.sleep(matchIncrement.getSleepTime());
logger.info(".........sleeping.........." + 1000*60 + "s......");
Thread.sleep(1000*60);
}
}
}

View File

@ -2,6 +2,10 @@ package com.ossean.match.matchprocess;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.MessageFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -12,6 +16,10 @@ import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.dao.DataAccessException;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
@ -23,6 +31,7 @@ import com.ossean.match.lucene.LuceneSearch;
import com.ossean.match.model.Project;
import com.ossean.match.model.RelativeMemo;
import com.ossean.match.model.RelativeMemoToOpenSourceProject;
import com.ossean.match.utils.ApplicationContextHelper;
import com.ossean.match.utils.Normalizer;
import javax.annotation.Resource;
@ -75,10 +84,12 @@ public class Match {
// 项目名匹配帖子标题
matchMap = LuceneSearch.prjToMemoMatchByLucene(prjName,
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader);
//System.out.println("项目名匹配帖子标题"+matchMap.size());
// 项目名匹配帖子标签
matchMap = LuceneSearch.searchMemoTags(
prjName, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight,
matchMap, memoIndexReader);
//System.out.println("项目名匹配帖子标签"+matchMap.size());
}
if (project.getSynonyms() != null) {
synonyms = project.getSynonyms().toLowerCase();
@ -89,12 +100,14 @@ public class Match {
matchMap = LuceneSearch.prjToMemoMatchByLucene(synonym,
LuceneIndex.titleFieldName, titleWeight, matchMap, memoIndexReader);
}
//System.out.println("项目别名匹配帖子标题 "+matchMap.size());
//项目别名匹配帖子标签 TODO
for (String synonym : synonymsList) {
matchMap = LuceneSearch.searchMemoTags(
synonym, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, memoTagsWeight,
matchMap, memoIndexReader);
}
//System.out.println("项目别名匹配帖子标签 "+matchMap.size());
}
}
if (prjTags != null && prjTags.length() > 0 && matchMap.size() > 0) {
@ -105,12 +118,14 @@ public class Match {
prjTag, LuceneIndex.titleFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTitleWeight,
matchMap, memoIndexReader);
}
//System.out.println("用项目标签搜索帖子标题 "+matchMap.size());
// 用项目标签搜索帖子标签
for (String prjTag : prjTagsList) {
matchMap = LuceneSearch.searchByPrjTag(
prjTag, LuceneIndex.memoTagsFieldName, LuceneIndex.memoIdFieldName, prjTagsToMemoTagsWeight,
matchMap, memoIndexReader);
}
//System.out.println("用项目标签搜索帖子标签 "+matchMap.size());
}
if (matchMap.size() > 0)
insertPrjToMemoMatchResult(prjId, matchMap);
@ -132,6 +147,8 @@ public class Match {
*/
@Transactional(propagation=Propagation.REQUIRES_NEW)
public void insertPrjToMemoMatchResult(int prjId, HashMap<Integer, Double> matchMap) {
//long start = System.currentTimeMillis();
List<RelativeMemoToOpenSourceProject> list = new ArrayList<RelativeMemoToOpenSourceProject>();
for (Map.Entry<Integer, Double> entry : matchMap.entrySet()) {
try {
int memoId = entry.getKey();
@ -141,12 +158,38 @@ public class Match {
matchResult.setOsp_id(prjId);
matchResult.setRelative_memo_id(memoId);
matchResult.setMatch_weight(weight);
matchResultDao.insertMatchResult(getTargetTable(prjId), matchResult);
list.add(matchResult);
//matchResultDao.insertMatchResult(getTargetTable(prjId), matchResult);
}
if(list.size()==800 || list.size()==matchMap.size()){
batchInsertJDBC(list,getTargetTable(prjId));
//matchResultDao.insertMatchResultBatch(getTargetTable(prjId), list);
list.clear();
}
} catch (Exception e) {
logger.error("insertPrjToMemoMatchResult error: " + e);
}
}
//System.out.println(prjId+" current insert time cost:"+(System.currentTimeMillis()-start)/1000+" seconds");
}
/**
 * Inserts the given match results into {@code tableName} with a single
 * multi-row INSERT executed through the "jdbcTemplate" bean.
 *
 * <p>All rows of one call share one {@code match_time} value, formatted as
 * {@code yyyy-MM-dd HH:mm:ss} from the JVM clock when the call starts.
 * A null or empty list is a no-op, because an INSERT without value tuples is
 * not valid SQL.
 *
 * <p>NOTE(review): this method is called from within the same class; with
 * proxy-based transaction support the annotation below may not take effect on
 * such self-invocations - confirm against the transaction configuration.
 *
 * @param list      rows to insert; may be null or empty
 * @param tableName target table; concatenated into the statement as an
 *                  identifier, so it must come from trusted code only
 * @throws DataAccessException if executing the statement fails
 */
@Transactional()
public void batchInsertJDBC(List<RelativeMemoToOpenSourceProject> list,String tableName) throws DataAccessException {
    if (list == null || list.isEmpty()) {
        return;
    }
    JdbcTemplate jdbcTemplate=(JdbcTemplate)ApplicationContextHelper.getBean("jdbcTemplate");
    // Format the timestamp once per batch instead of building a formatter per row.
    String matchTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
    StringBuilder sqlbuf = new StringBuilder()
        .append("insert into "+tableName+" (osp_id,relative_memo_id,match_weight,match_time) values ");
    boolean first = true;
    for (RelativeMemoToOpenSourceProject prj : list) {
        // Separator goes before every tuple except the first, so nothing has to be trimmed afterwards.
        if (!first) {
            sqlbuf.append(",");
        }
        first = false;
        sqlbuf.append("("+prj.getOsp_id()+","+prj.getRelative_memo_id()+","
            +prj.getMatch_weight()+",'"+matchTime+"')");
    }
    jdbcTemplate.update(sqlbuf.toString());
}
/**
@ -163,39 +206,8 @@ public class Match {
int a = 1 + ospId/11000;
targetTableName = "relative_memo_to_open_source_projects_" + a;
}
// if (osp_id < 500) {
// targetTableName = "relative_memo_to_open_source_projects_1";
// }
// else if (osp_id >= 500 && osp_id < 1000) {
// targetTableName = "relative_memo_to_open_source_projects_2";
// }
// else if (osp_id >= 1000 && osp_id < 1500) {
// targetTableName = "relative_memo_to_open_source_projects_3";
// }
// else if (osp_id >= 1500 && osp_id < 2000) {
// targetTableName = "relative_memo_to_open_source_projects_4";
// }
// else if (osp_id >= 2000 && osp_id < 3000) {
// targetTableName = "relative_memo_to_open_source_projects_5";
// }
// else if (osp_id >= 3000 && osp_id < 5000) {
// targetTableName = "relative_memo_to_open_source_projects_6";
// }
// else if (osp_id >= 5000 && osp_id < 7500) {
// targetTableName = "relative_memo_to_open_source_projects_7";
// }
// else if (osp_id >= 7500 && osp_id < 10000) {
// targetTableName = "relative_memo_to_open_source_projects_8";
// }
// else if (osp_id >= 10000 && osp_id < 310000) {
// int a = 7 + osp_id/5000;
// targetTableName = "relative_memo_to_open_source_projects_" + a;
// }
// else if (osp_id >= 310000) {
// targetTableName = "relative_memo_to_open_source_projects_70";
// }
return targetTableName;
//return "relative_memo_to_open_source_projects";
}
/**
@ -229,9 +241,9 @@ public class Match {
memoTags = memo.getTags().toLowerCase();
}
memoTitle = memo.getTitle().toLowerCase();
if (memoTitle.length() == 0) {
continue;
}
// if (memoTitle.length() == 0) {
// continue;
// }
if (Normalizer.isAllNumber(memoTitle)) {
continue;
}

View File

@ -13,56 +13,16 @@ public class MatchIncrement extends Thread implements Runnable {
private Logger logger = LoggerFactory.getLogger(getClass());
private int memoHistory; // the record of memo after matching last time
private static File record; // record file "record.txt"
private boolean newPrjComing; //新项目进入标识
private boolean newMemoComing; //新帖子进入标识
private boolean currentMatchDone = true; //当前批量匹配是否结束
private boolean newPrjComing; // 新项目进入标识
private boolean newMemoComing; // 新帖子进入标识
private boolean currentMatchDone = true; // 当前批量匹配是否结束
private int sleepTime = 0;
public boolean isNewPrjComing() {
return newPrjComing;
}
public void setNewPrjComing(boolean newPrjComing) {
this.newPrjComing = newPrjComing;
}
public boolean isNewMemoComing() {
return newMemoComing;
}
public void setNewMemoComing(boolean newMemoComing) {
this.newMemoComing = newMemoComing;
}
public int getSleepTime() {
return sleepTime;
}
public void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
public MatchIncrement() {
record = new File("record.txt");
getMatchRecord();
}
public int getMemoHistory() {
return memoHistory;
}
public synchronized void setMemoHistory(int memoHistory) {
this.memoHistory = memoHistory;
}
public boolean isCurrentMatchDone() {
return currentMatchDone;
}
public synchronized void setCurrentMatchDone(boolean currentMatchDone) {
this.currentMatchDone = currentMatchDone;
}
/**
* read the record after matching last time from record file "record.txt",
* the format of the record in the file: "project's record" "memo's record"
@ -91,4 +51,45 @@ public class MatchIncrement extends Thread implements Runnable {
logger.error("writeIntoRecord FileNotFoundException " + e);
}
}
public boolean isNewPrjComing() {
return newPrjComing;
}
public void setNewPrjComing(boolean newPrjComing) {
this.newPrjComing = newPrjComing;
}
public boolean isNewMemoComing() {
return newMemoComing;
}
public void setNewMemoComing(boolean newMemoComing) {
this.newMemoComing = newMemoComing;
}
public int getSleepTime() {
return sleepTime;
}
public void setSleepTime(int sleepTime) {
this.sleepTime = sleepTime;
}
public int getMemoHistory() {
return memoHistory;
}
public synchronized void setMemoHistory(int memoHistory) {
this.memoHistory = memoHistory;
}
public boolean isCurrentMatchDone() {
return currentMatchDone;
}
public synchronized void setCurrentMatchDone(boolean currentMatchDone) {
this.currentMatchDone = currentMatchDone;
}
}

View File

@ -14,6 +14,7 @@ import org.springframework.stereotype.Component;
import com.ossean.match.dao.ProjectDao;
import com.ossean.match.lucene.LuceneIndex;
import com.ossean.match.model.Configure;
import com.ossean.match.model.Project;
@Component("prjtomemomatch")
@ -26,16 +27,25 @@ public class PrjToMemoMatch implements Runnable {
@Qualifier("match")
@Autowired
private Match match;
private Configure conf;
private String configureName = "configure";
public void setMatchIncrement(MatchIncrement matchIncrement) {
this.matchIncrement = matchIncrement;
}
public void prjToMemoIncrement() throws IOException, SQLException {
public void prjToMemoIncrement(boolean isnew) throws IOException, SQLException {
matchIncrement.setCurrentMatchDone(false);
matchIncrement.getMatchRecord();
conf = new Configure(configureName);
List<Project> newPrjs;
// 批量读取filtration为1或者filtration为2且update_mark不为空的项目即未匹配的和已匹配但有更新的
List<Project> newPrjs = projectDao.getBatchPrjsIncre(step);
if(isnew)
newPrjs = projectDao.getBatchPrjsIncreWithoutLimit(step);
else
newPrjs = projectDao.getBatchPrjsIncre(conf.getStartId(),conf.getEndId(),conf.getBatchSize());
while (newPrjs != null && newPrjs.size() > 0) {
if (newPrjs != null) {
match.prjToMemoMatch(newPrjs);
@ -44,16 +54,33 @@ public class PrjToMemoMatch implements Runnable {
luceneIndex.prjIndexIncrement(newPrjs);
}
// getting projects' id and prjName in batches
newPrjs = projectDao.getBatchPrjsIncre(step);
if(isnew)
newPrjs = projectDao.getBatchPrjsIncreWithoutLimit(step);
else
newPrjs = projectDao.getBatchPrjsIncre(conf.getStartId(),conf.getEndId(),conf.getBatchSize());
}
matchIncrement.setCurrentMatchDone(true);
}
public void run() {
if (matchIncrement.isNewPrjComing()) {
logger.info("start osps_to_relative_memos match!!!");
conf = new Configure(configureName);
boolean newcoming = conf.isNewComing();
if (newcoming) {//对于正常匹配程序处理新数据流
if(matchIncrement.isNewPrjComing()){
logger.info("start osps_to_relative_memos match for new prj");
try {
prjToMemoIncrement(true);
} catch (IOException e) {
logger.error("IOException: " + e);
} catch (SQLException e) {
logger.error("SQLException: " + e);
}
}
}
else{
logger.info("start osps_to_relative_memos match with multi thread");
try {
prjToMemoIncrement();
prjToMemoIncrement(false);
} catch (IOException e) {
logger.error("IOException: " + e);
} catch (SQLException e) {

View File

@ -0,0 +1,98 @@
package com.ossean.match.model;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;
import org.apache.log4j.Logger;
/**
 * Runtime settings of the matcher, read from an XML properties resource.
 *
 * <p>The constructor loads {@code <confPath>.xml} via
 * {@link ClassLoader#getSystemResource(String)} and parses the keys
 * {@code startId}, {@code endId}, {@code batchSize} and {@code newComing}.
 */
public class Configure {
    private Properties prop = new Properties();
    Logger log4j = Logger.getLogger(Configure.class);
    private int startId;
    private int endId;
    private int batchSize;
    private boolean newComing;

    /**
     * Loads and parses the configuration resource.
     *
     * @param confPath resource name without the {@code .xml} extension
     * @throws IllegalStateException if the resource cannot be found
     * @throws NumberFormatException if a required integer property is missing
     *                               or is not a valid integer
     */
    public Configure(String confPath) {
        confPath = confPath + ".xml";
        // getSystemResource returns null for a missing resource; fail with a
        // message that names the resource instead of an anonymous NPE.
        URL url = ClassLoader.getSystemResource(confPath);
        if (url == null) {
            throw new IllegalStateException("Configuration resource not found on classpath: " + confPath);
        }
        try {
            InputStream is = url.openStream();
            // loadFromXML closes the stream when it returns.
            prop.loadFromXML(is);
        } catch (FileNotFoundException e) {
            log4j.error("Configure FileNotFoundException !", e);
        } catch (InvalidPropertiesFormatException e) {
            log4j.error("Configure resource is not a valid XML properties document: " + confPath, e);
        } catch (IOException e) {
            log4j.error("Configure could not read resource: " + confPath, e);
        }
        config();
    }

    /** Copies the loaded properties into the typed fields. */
    private void config() {
        this.startId = requiredInt("startId");
        this.endId = requiredInt("endId");
        this.batchSize = requiredInt("batchSize");
        String rawNewComing = prop.getProperty("newComing");
        // A missing value parses to false, as Boolean.parseBoolean(null) does.
        this.newComing = Boolean.parseBoolean(rawNewComing == null ? null : rawNewComing.trim());
    }

    /** Reads an integer property, naming the key in the error when it is missing or malformed. */
    private int requiredInt(String key) {
        String raw = prop.getProperty(key);
        if (raw == null) {
            throw new NumberFormatException("Missing required configuration property '" + key + "'");
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            NumberFormatException detailed = new NumberFormatException(
                "Configuration property '" + key + "' is not an integer: '" + raw + "'");
            detailed.initCause(e);
            throw detailed;
        }
    }

    /** @return the raw properties backing this configuration */
    public Properties getProp() {
        return prop;
    }

    /** Replaces the raw properties; the typed fields are not re-parsed. */
    public void setProp(Properties prop) {
        this.prop = prop;
    }

    /** @return the logger used by this instance */
    public Logger getLog4j() {
        return log4j;
    }

    /** @param log4j the logger to use from now on */
    public void setLog4j(Logger log4j) {
        this.log4j = log4j;
    }

    /** @return the configured {@code startId} */
    public int getStartId() {
        return startId;
    }

    /** @param startId new {@code startId} value */
    public void setStartId(int startId) {
        this.startId = startId;
    }

    /** @return the configured {@code endId} */
    public int getEndId() {
        return endId;
    }

    /** @param endId new {@code endId} value */
    public void setEndId(int endId) {
        this.endId = endId;
    }

    /** @return the configured {@code batchSize} */
    public int getBatchSize() {
        return batchSize;
    }

    /** @param batchSize new {@code batchSize} value */
    public void setBatchSize(int batchSize) {
        this.batchSize = batchSize;
    }

    /** @return the configured {@code newComing} flag */
    public boolean isNewComing() {
        return newComing;
    }

    /** @param newComing new {@code newComing} flag */
    public void setNewComing(boolean newComing) {
        this.newComing = newComing;
    }
}

View File

@ -0,0 +1,36 @@
package com.ossean.match.utils;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
/**
 * Holds the Spring {@link ApplicationContext} in a static field so that code
 * without injection support can look beans up by name.
 */
public class ApplicationContextHelper implements ApplicationContextAware{
    private static ApplicationContext appCtx;

    /**
     * Stores the given ApplicationContext in the static field of this class.
     *
     * @param applicationContext the ApplicationContext object
     * @throws BeansException declared by the implemented interface
     */
    @Override
    public void setApplicationContext( ApplicationContext applicationContext ) throws BeansException {
        appCtx = applicationContext;
    }

    /**
     * Returns the stored ApplicationContext.
     *
     * @return the context, or {@code null} if none has been set yet
     */
    public static ApplicationContext getApplicationContext(){
        return appCtx;
    }

    /**
     * Convenience lookup of a bean by name.
     *
     * @param beanName name of the bean
     * @return the bean object
     * @throws IllegalStateException if no ApplicationContext has been set yet
     */
    public static Object getBean(String beanName) {
        ApplicationContext ctx = appCtx;
        if (ctx == null) {
            // Fail with an explanation instead of a bare NullPointerException.
            throw new IllegalStateException(
                "ApplicationContext has not been set yet; cannot look up bean '" + beanName + "'");
        }
        return ctx.getBean(beanName);
    }
}

View File

@ -64,15 +64,17 @@ public class Normalizer {
}
public static void main(String []args) {
String content = "大声道Corvette123 *&%9&^&%&^%(*_asd's_12 L'Aur--ore, 1766-1775";
int a = 1 + 11000/11000;
String targetTableName = "relative_memo_to_open_source_projects_" + a;
//System.out.println(StringFilter(content));
String result = content.replaceAll("[^0-9a-zA-Z]"," ");
System.out.println(result);
System.out.println(normalize(content));
System.out.println(content.lastIndexOf(" "));
//System.out.println(StringFilter(content));
// String content = "大声道Corvette123 *&%9&^&%&^%(*_asd's_12 L'Aur--ore, 1766-1775";
// int a = 1 + 11000/11000;
// String targetTableName = "relative_memo_to_open_source_projects_" + a;
// //System.out.println(StringFilter(content));
// String result = content.replaceAll("[^0-9a-zA-Z]"," ");
// System.out.println(result);
// System.out.println(normalize(content));
// System.out.println(content.lastIndexOf(" "));
// //System.out.println(StringFilter(content));
String a = "'One' PHP cache";
System.out.println(getList(a));
}
/**
* if a string only contains digital

View File

@ -6,6 +6,6 @@
<entry key="ext_dict">ext.dic;</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">stopword_1.dic;code_keywords.dic</entry>
<entry key="ext_stopwords"></entry>
</properties>

View File

@ -11,6 +11,11 @@
http://www.springframework.org/schema/tx/spring-tx.xsd
http://www.springframework.org/schema/context
http://www.springframework.org/schema/context/spring-context-3.0.xsd">
<bean id="jdbcTemplate" class="org.springframework.jdbc.core.JdbcTemplate">
<property name="dataSource" ref="dataSource"></property>
</bean>
<bean id="SpringApplicationContext" class="com.ossean.match.utils.ApplicationContextHelper"></bean>
<bean id="sqlSessionFactory" class="org.mybatis.spring.SqlSessionFactoryBean">
<property name="dataSource" ref="dataSource" />
@ -27,10 +32,10 @@
class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
<property name="dataSource" ref="dataSource" />
</bean>
<!-- enable transaction annotation support -->
<tx:annotation-driven transaction-manager="txManager" />
<bean id="dataSource" class="org.apache.commons.dbcp.BasicDataSource"
destroy-method="close">

View File

@ -0,0 +1,697 @@
private
protected
public
abstract
class
extends
final
implements
interface
native
new
static
strictfp
synchronized
transient
volatile
break
continue
return
do
while
if
else
for
instanceof
switch
case
defult
catch
finally
throw
throws
try
import
package
boolean
byte
char
double
float
int
long
short
null
true
false
super
this
void
form
do
sourceforge
com
cn
www
http
js
script
a
test
error
exception
about
website
able
abstract
console
sql
windows
exe
txt
doc
xls
local
net
web
server
from
using
error
can
date
file
xml
can
system
url
value
create
text
set
get
list
table
select
distinct
object
open
close
clear
all
time
have
org
main
start
end
version
private
public
index
api
method
source
root
content
write
read
view
one
page
run
log
win
bin
first
next
src
app
request
println
namespace
printf
button
title
local
define
lib
make
files
help
article
bool
boolean
config
load
args
date
thread
cpu
more
document
util
info
home
style
body
print
hello
world
find
left
std
debug
etc
like
top
now
map
context
other
post
format
client
encoding
session
program
datebase
control
language
base
process
want
values
used
status
project
color
array
loaclhost
please
click
integer
example
archive
response
command
event
param
what
core
build
link
display
copy
stdio
release
mode
after
check
cpp
c
port
target
should
library
lib
tcp
udp
state
sum
free
last
connect
configuration
none
download
software
buffer
query
bit
temp
word
block
two
send
device
layout
sdk
cache
alert
see
change
work
kernel
filter
handel
save
tools
min
count
header
level
framework
sys
email
push
pull
heap
stack
user
must
demo
just
serach
res
tmp
down
option
img
network
model
stop
problem
position
node
space
login
font
todo
background
resource
mac
bytes
ios
empty
note
self
tag
column
studio
red
done
baidu
sleep
convert
global
field
reference
way
settings
simple
ctrl
wait
meta
edit
runtime
store
equals
services
bean
ftp
exec
non
enter
lock
admin
configure
png
jpg
abc
loop
vector
setup
mail
flag
machine
sample
share
image
ref
join
where
parameter
users
contain
know
template
class
math
pdf
parse
timeout
cat
through
basic
invoke
resources
media
uri
paltform
err
sudo
range
menu
mapping
tables
master
memset
reset
environment
pop
commit
report
task
active
domain
good
step
setting
day
hash
move
collections
phone
engine
callback
datetime
objects
mobile
profile
always
storage
alt
trim
applications
via
design
nothing
makefile
clean
small
solution
custom
people
projects
family
attributes
enum
own
works
messages
low
signal
browser
give
aaa
event
assert
look
company
person
either
case
desktop
touch
book
unit
docs
params
dump
pool
feature
hashmap
seconds
owner
goto
keys
editor
env
month
year
loading
changes
black
fetch
guide
enterprise
multi
foundation
reflect
unique
upload
play
idea
exist
elements
account
numbers
examples
plain
solid
game
hide
apps
symbol
layer
least
weight
beta
jpeg
attr
refresh
apply
focus
fun
cfg
threads
receive
recieved
easy
hard
mode
logs
password
passwd
packages
ignore
scripts
cell
later
plus
his
pub
going
logging
channel
think
serial
things
loader
player
libs
team
backup
conector
side
utf8
speed
doing
price
timestamp
repository
init
HelloWorld
tesing
scan
components
speed
unlock
products
mid
said
token
optional
today
rest
pack
coding
contain
points
days
safe
typeof
community
others
manual
sets
closed
fine
groups
specify
sources
workder
problems
perform
servers
articles
skip
blocks
connections
related
notify
onload
known
assembly
useful
monitor
thing
forum
come
notification
rules
developers
cross
notice
soft
progress
fork
hosts
logger
role
everything
future
references
jni
locale
follow
words
updates
necessary
writer
mouse
sign
repoter
containing
route
analysis
four
comments
programs
maps
turn
versions
drive
parser
track
money
utils
reload
decimal
resume
behavior
hand
board
light
regex
terminal
reduce
requests
fff
0xfffff
strong
edge
schedule
pair
swf
tom
dependency
rule
escape
hook
temporary
issues
exchange
review
various
life
friend
face
complex
readme
prop
expert
extend
often
automatic
capture
difference
scheme
develop
together
steps
reply
beginning
understand
identified
wall
along
market
minute
hour
university
plan
templates
codes
ccc
diff
ppt
study
abcd
lost
choice
explain
likely
guid
taken
period
FAQ
symbols
thus
tips
compute
school
languages
describe
resolution
deal
detect
yellow
hope
visit
xyz
iii
dog
jump
knowledge
publish
inserted
levels
chip
preference
done
adjust
earth
expressions
house
central
rich
chat
sessions
deep
when
where
how
who
仅供参考

View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>TableFlow</comment>
<entry key="startId">0</entry>
<entry key="endId">200000</entry>
<entry key="batchSize">5000</entry>
<entry key="newComing">false</entry>
</properties>

View File

@ -33,703 +33,4 @@ will
with
一个
codekeywords:
private
protected
public
abstract
class
extends
final
implements
interface
native
new
static
strictfp
synchronized
transient
volatile
break
continue
return
do
while
if
else
for
instanceof
switch
case
defult
catch
finally
throw
throws
try
import
package
boolean
byte
char
double
float
int
long
short
null
true
false
super
this
void
form
do
sourceforge
com
cn
www
http
js
script
a
test
error
exception
about
website
able
abstract
console
sql
windows
exe
txt
doc
xls
local
net
web
server
from
using
error
can
date
file
xml
can
system
url
value
create
text
set
get
list
table
select
distinct
object
open
close
clear
all
time
have
org
main
start
end
version
private
public
index
api
method
source
root
content
write
read
view
one
page
run
log
win
bin
first
next
src
app
request
println
namespace
printf
button
title
local
define
lib
make
files
help
article
bool
boolean
config
load
args
date
thread
cpu
more
document
util
info
home
style
body
print
hello
world
find
left
std
debug
etc
like
top
now
map
context
other
post
format
client
encoding
session
program
datebase
control
language
base
process
want
values
used
status
project
color
array
loaclhost
please
click
integer
example
archive
response
command
event
param
what
core
build
link
display
copy
stdio
release
mode
after
check
cpp
c
port
target
should
library
lib
tcp
udp
state
sum
free
last
connect
configuration
none
download
software
buffer
query
bit
temp
word
block
two
send
device
layout
sdk
cache
alert
see
change
work
kernel
filter
handel
save
tools
min
count
header
level
framework
sys
email
push
pull
heap
stack
user
must
demo
just
serach
res
tmp
down
option
img
network
model
stop
problem
position
node
space
login
font
todo
background
resource
mac
bytes
ios
empty
note
self
tag
column
studio
red
done
baidu
sleep
convert
global
field
reference
way
settings
simple
ctrl
wait
meta
edit
runtime
store
equals
services
bean
ftp
exec
non
enter
lock
admin
configure
png
jpg
abc
loop
vector
setup
mail
flag
machine
sample
share
image
ref
join
where
parameter
users
contain
know
template
class
math
pdf
parse
timeout
cat
through
basic
invoke
resources
media
uri
paltform
err
sudo
range
menu
mapping
tables
master
memset
reset
environment
pop
commit
report
task
active
domain
good
step
setting
day
hash
move
collections
phone
engine
callback
datetime
objects
mobile
profile
always
storage
alt
trim
applications
via
design
nothing
makefile
clean
small
solution
custom
people
projects
family
attributes
enum
own
works
messages
low
signal
browser
give
aaa
event
assert
look
company
person
either
case
desktop
touch
book
unit
docs
params
dump
pool
feature
hashmap
seconds
owner
goto
keys
editor
env
month
year
loading
changes
black
fetch
guide
enterprise
multi
foundation
reflect
unique
upload
play
idea
exist
elements
account
numbers
examples
plain
solid
game
hide
apps
symbol
layer
least
weight
beta
jpeg
attr
refresh
apply
focus
fun
cfg
threads
receive
recieved
easy
hard
mode
logs
password
passwd
packages
ignore
scripts
cell
later
plus
his
pub
going
logging
channel
think
serial
things
loader
player
libs
team
backup
conector
side
utf8
speed
doing
price
timestamp
repository
init
HelloWorld
tesing
scan
components
speed
unlock
products
mid
said
token
optional
today
rest
pack
coding
contain
points
days
safe
typeof
community
others
manual
sets
closed
fine
groups
specify
sources
workder
problems
perform
servers
articles
skip
blocks
connections
related
notify
onload
known
assembly
useful
monitor
thing
forum
come
notification
rules
developers
cross
notice
soft
progress
fork
hosts
logger
role
everything
future
references
jni
locale
follow
words
updates
necessary
writer
mouse
sign
repoter
containing
route
analysis
four
comments
programs
maps
turn
versions
drive
parser
track
money
utils
reload
decimal
resume
behavior
hand
board
light
regex
terminal
reduce
requests
fff
0xfffff
strong
edge
schedule
pair
swf
tom
dependency
rule
escape
hook
temporary
issues
exchange
review
various
life
friend
face
complex
readme
prop
expert
extend
often
automatic
capture
difference
scheme
develop
together
steps
reply
beginning
understand
identified
wall
along
market
minute
hour
university
plan
templates
codes
ccc
diff
ppt
study
abcd
lost
choice
explain
likely
guid
taken
period
FAQ
symbols
thus
tips
compute
school
languages
describe
resolution
deal
detect
yellow
hope
visit
xyz
iii
dog
jump
knowledge
publish
inserted
levels
chip
preference
done
adjust
earth
expressions
house
central
rich
chat
sessions
deep
when
where
how
who
仅供参考

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>TableFlow</comment>
<entry key="startTableId">1</entry>
<entry key="endTableId">20</entry>
</properties>

View File

@ -0,0 +1,78 @@
package com.ossean.projectmanager.model;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.InvalidPropertiesFormatException;
import java.util.List;
import java.util.Properties;
import org.apache.log4j.Logger;
/**
 * Runtime settings of the project manager, read from an XML properties resource.
 *
 * <p>The constructor loads {@code <confPath>.xml} via
 * {@link ClassLoader#getSystemResource(String)} and parses the keys
 * {@code startTableId} and {@code endTableId}.
 */
public class Configure {
    private Properties prop = new Properties();
    Logger log4j = Logger.getLogger(Configure.class);
    private int startTableId;
    private int endTableId;

    /**
     * Loads and parses the configuration resource.
     *
     * @param confPath resource name without the {@code .xml} extension
     * @throws IllegalStateException if the resource cannot be found
     * @throws NumberFormatException if a required integer property is missing
     *                               or is not a valid integer
     */
    public Configure(String confPath) {
        confPath = confPath + ".xml";
        // getSystemResource returns null for a missing resource; fail with a
        // message that names the resource instead of an anonymous NPE.
        URL url = ClassLoader.getSystemResource(confPath);
        if (url == null) {
            throw new IllegalStateException("Configuration resource not found on classpath: " + confPath);
        }
        try {
            InputStream is = url.openStream();
            // loadFromXML closes the stream when it returns.
            prop.loadFromXML(is);
        } catch (FileNotFoundException e) {
            log4j.error("Configure FileNotFoundException !", e);
        } catch (InvalidPropertiesFormatException e) {
            log4j.error("Configure resource is not a valid XML properties document: " + confPath, e);
        } catch (IOException e) {
            log4j.error("Configure could not read resource: " + confPath, e);
        }
        config();
    }

    /** Copies the loaded properties into the typed fields. */
    private void config() {
        this.startTableId = requiredInt("startTableId");
        this.endTableId = requiredInt("endTableId");
    }

    /** Reads an integer property, naming the key in the error when it is missing or malformed. */
    private int requiredInt(String key) {
        String raw = prop.getProperty(key);
        if (raw == null) {
            throw new NumberFormatException("Missing required configuration property '" + key + "'");
        }
        try {
            return Integer.parseInt(raw.trim());
        } catch (NumberFormatException e) {
            NumberFormatException detailed = new NumberFormatException(
                "Configuration property '" + key + "' is not an integer: '" + raw + "'");
            detailed.initCause(e);
            throw detailed;
        }
    }

    /** @return the raw properties backing this configuration */
    public Properties getProp() {
        return prop;
    }

    /** Replaces the raw properties; the typed fields are not re-parsed. */
    public void setProp(Properties prop) {
        this.prop = prop;
    }

    /** @return the logger used by this instance */
    public Logger getLog4j() {
        return log4j;
    }

    /** @param log4j the logger to use from now on */
    public void setLog4j(Logger log4j) {
        this.log4j = log4j;
    }

    /** @return the configured {@code startTableId} */
    public int getStartTableId() {
        return startTableId;
    }

    /** @param startTableId new {@code startTableId} value */
    public void setStartTableId(int startTableId) {
        this.startTableId = startTableId;
    }

    /** @return the configured {@code endTableId} */
    public int getEndTableId() {
        return endTableId;
    }

    /** @param endTableId new {@code endTableId} value */
    public void setEndTableId(int endTableId) {
        this.endTableId = endTableId;
    }
}

View File

@ -9,6 +9,7 @@ import org.springframework.stereotype.Component;
import com.ossean.projectmanager.lasttabledao.RelativeMemoDao;
import com.ossean.projectmanager.lasttabledao.RelativeMemoToOpenSourceProjectDao;
import com.ossean.projectmanager.model.Configure;
import com.ossean.projectmanager.model.RelativeMemo;
import com.ossean.projectmanager.model.RelativeMemoToOpenSourceProject;
@ -21,6 +22,7 @@ public class SyncMatchResult {
private RelativeMemoToOpenSourceProjectDao matchResultDao;
Logger logger = Logger.getLogger(this.getClass());
private Configure conf;
/**
* 从帖子表中同步帖子信息到匹配结果表
@ -28,39 +30,32 @@ public class SyncMatchResult {
* @throws InterruptedException
*/
public void syncMatchResult() {
while (true) {
// int notSyncNum = matchResultDao.getNotSyncCount("relative_memo_to_open_source_projects_1");
// if (notSyncNum < 1) {
// try {
// logger.info("sleeping 10s......");
// Thread.sleep(10000);
// } catch (InterruptedException e) {
// logger.error(e);
// }
// }
for (int i = 1; i <= 70; i++) {
String matchResultTableName = "relative_memo_to_open_source_projects_" + String.valueOf(i);
//logger.info("starting sync " + matchResultTableName + "......");
//long startReadMT = System.currentTimeMillis();
List<RelativeMemoToOpenSourceProject> rmtospList = matchResultDao.getMatchResultsByHassync(matchResultTableName, 50000);
//long endReadMT = System.currentTimeMillis();
//logger.info("get rmtospList " + (endReadMT - startReadMT)/6000 + " s");
if (rmtospList !=null && rmtospList.size() > 0) {
for (RelativeMemoToOpenSourceProject rmtosp : rmtospList) {
int curMemoId = rmtosp.getRelative_memo_id();
RelativeMemo curMemo = lastMemoDao.getById(curMemoId);
rmtosp.setCreated_time(curMemo.getCreated_time());
rmtosp.setMemo_type(curMemo.getMemo_type());
rmtosp.setReplies_num(curMemo.getReplies_num());
rmtosp.setView_num_crawled(curMemo.getView_num());
matchResultDao.syncMatchResult(matchResultTableName, rmtosp);
logger.info("updated " + rmtosp.getId() + "......");
}
conf = new Configure("configure");
int startTableId = conf.getStartTableId();
int endTableId = conf.getEndTableId();
for (int i = startTableId; i <= endTableId; i++) {
String matchResultTableName = "relative_memo_to_open_source_projects_" + String.valueOf(i);
logger.info("starting sync " + matchResultTableName + "......");
List<RelativeMemoToOpenSourceProject> rmtospList = matchResultDao.getMatchResultsByHassync(matchResultTableName, 50000);
long startCircle = System.currentTimeMillis();
while(rmtospList !=null && rmtospList.size() > 0){
for (RelativeMemoToOpenSourceProject rmtosp : rmtospList) {
int curMemoId = rmtosp.getRelative_memo_id();
RelativeMemo curMemo = lastMemoDao.getById(curMemoId);
rmtosp.setCreated_time(curMemo.getCreated_time());
rmtosp.setMemo_type(curMemo.getMemo_type());
rmtosp.setReplies_num(curMemo.getReplies_num());
rmtosp.setView_num_crawled(curMemo.getView_num());
matchResultDao.syncMatchResult(matchResultTableName, rmtosp);
}
//long endCircle = System.currentTimeMillis();
//logger.info("endCircle : " + (endCircle - endReadMT)/6000 +" s");
logger.info("updated " + rmtospList.get(rmtospList.size()-1).getId() + "......");
rmtospList = matchResultDao.getMatchResultsByHassync(matchResultTableName, 50000);
}
}
long endCircle = System.currentTimeMillis();
logger.info(matchResultTableName+" sync stop cost : " + (endCircle - startCircle)/6000 +" s");
}
logger.info("all match table have sync over!!!!");
}

View File

@ -21,7 +21,7 @@
<property name="url"
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="1234" />
<property name="password" value="123456" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>
@ -41,7 +41,7 @@
<property name="url"
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="1234" />
<property name="password" value="123456" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>

View File

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>TableFlow</comment>
<entry key="startTableId">1</entry>
<entry key="endTableId">2</entry>
</properties>

View File

@ -15,8 +15,14 @@
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.springframework.ide.eclipse.core.springbuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.springframework.ide.eclipse.core.springnature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>

View File

@ -17,7 +17,7 @@ angularjs
angular2
angular
node.js
jqury
jquery
html
spark
nginx

View File

@ -6,7 +6,15 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Date;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
@ -16,8 +24,10 @@ import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import com.ossean.dao.DBDest;
import com.ossean.dao.DBSource;
import com.ossean.dao.GatherDao;
import com.ossean.dao2.PlatformProjectDao;
import com.ossean.model.GatherProjectsModel;
import com.ossean.util.EDDHandler;
import com.ossean.util.FileReader;
@ -32,12 +42,22 @@ public class GetSynonyms{
@Resource
private DBSource dbSource;
@Resource
public DBDest dbDest;
@Resource
private GatherDao gatherDao;
@Resource
private PlatformProjectDao paltformDao;
private static String gatherProjectsTableName = TableName.gatherProjectsTableName;
private static String gatherProjectsTableName ="dubsynonymsprj_test";//TableName.gatherProjectsTableName;
private static String synonymsTableName = TableName.synonymsTableName;
private static String pointerTableName = TableName.pointerTableName;
private static int batchSize = 1000;
//为了很好的将别名提取的程序结构化特设以下三个全局变量在从项目名称中提取别名时使用
String projectNameWithoutBracket = "";//项目名去除括号
String projectNameWithoutComName = "";//项目名称去除括号和公司基金名字
String bracket = "";//括号中内容
public void start() {
long start_all = System.currentTimeMillis();
@ -63,19 +83,20 @@ public class GetSynonyms{
}
long start = System.currentTimeMillis();
for (GatherProjectsModel model : gpmList) {
Map<String, Integer> synonymsList = getSynonyms(model);
updateTable(synonymsList,model);
// if(synonymsList !=null && synonymsList.size()!=0)
// for (String name : synonymsList.keySet()) {
// dbSource.insertSynonymsNew(synonymsTableName,
// model.getId(), name, synonymsList.get(name));
// }
// // 更新update_mark=1
// gatherDao.updateMark(gatherProjectsTableName, 1, model.getId());
Map<String, Integer> synonymMap = getSynonyms(model);
try
{
//当多个项目具有相同别名的时候选出最适合的那个项目并将其余项目的该别名所对应的flag置0
synonymMap = synonymsFilter(synonymMap,synonymsTableName,model);
}
catch(Exception e)
{
logger.error("prjId:" + model.getId() + " 在处理多个项目有相同别名的时候出现错误:" + e);
}
updateTable(synonymMap,model);
}
// long end = System.currentTimeMillis();
// logger.warn("insert synonyms:deal with "+gpmList.size()+" projects cost: "+(float)(end
// - start)/60000+" minutes");
}
}
@ -89,6 +110,167 @@ public class GetSynonyms{
// 更新update_mark=1
gatherDao.updateMark(gatherProjectsTableName, 1, model.getId());
}
/**
 * Arbitrates aliases that are claimed by more than one project.
 *
 * For every alias in {@code synonymMap} whose value is 1, the projects already
 * stored in {@code targetTable} with the same alias and flag 1 are compared with
 * {@code model} through getBetterPrj. If {@code model} wins, the flag of that
 * alias is set to 0 for every other project; otherwise the alias is downgraded
 * to 0 in {@code synonymMap} and the flag is set to 0 for every project except
 * the winner.
 *
 * @param synonymMap  alias -> confidence (1 = sure, 0 = not sure); may be null
 * @param targetTable name of the synonyms table
 * @param model       project the aliases were extracted from
 * @return the same map instance, updated in place (null when null was passed)
 */
public Map<String,Integer> synonymsFilter(Map<String,Integer> synonymMap,String targetTable,GatherProjectsModel model)
{
    // Nothing to arbitrate when no alias was extracted.
    if(synonymMap == null || synonymMap.isEmpty())
    {
        return synonymMap;
    }
    for(Map.Entry<String,Integer> entry : synonymMap.entrySet())
    {
        // Only "sure" aliases take part; a null value is treated as not sure.
        if(!Integer.valueOf(1).equals(entry.getValue()))
        {
            continue;
        }
        String key = entry.getKey();
        // best always holds the most suitable project seen so far for this alias
        GatherProjectsModel best = model;
        Set<Integer> dupSynonymPrj = getSameSynonymPrj(key,targetTable);
        if(dupSynonymPrj != null)
        {
            for(Integer prjId : dupSynonymPrj)
            {
                if(prjId == null)
                {
                    continue;
                }
                GatherProjectsModel prj = dbSource.getGatherProjectById(gatherProjectsTableName, prjId);
                if(prj == null)
                {
                    // the alias row points at a project that is no longer in the gather table
                    continue;
                }
                GatherProjectsModel better = getBetterPrj(best, prj);
                if(better != null)
                {
                    best = better;
                }
            }
        }
        if(best.getId() == model.getId())
        {
            // the current project owns the alias: clear the flag on all other projects
            dbDest.updateSynoymmings(targetTable,key,model.getId(),0);
        }
        else
        {
            // another project owns the alias: downgrade it here and on everyone but the winner
            entry.setValue(0);
            dbDest.updateSynoymmings(targetTable, key, best.getId(), 0);
        }
    }
    return synonymMap;
}
/**
 * Looks up the ids of the projects that already hold the alias {@code key}
 * with flag 1 in {@code targetTable}.
 *
 * @return whatever the DAO returns for that query
 */
public Set<Integer> getSameSynonymPrj(String key,String targetTable)
{
    final int sureFlag = 1;
    return dbDest.selectSameSynonymPrj(targetTable, key, sureFlag);
}
/**
 * Decides which of two projects is the better owner of a shared alias.
 *
 * Projects from different communities are ranked by getCommunityValue. Projects
 * from the same community are compared on one popularity column of that
 * community's extraction table (freecode: on the extracted time). When the
 * community is unknown for both, or the comparison cannot be made,
 * {@code model1} is kept.
 *
 * @return {@code model1} or {@code model2}; never null
 */
private GatherProjectsModel getBetterPrj(GatherProjectsModel model1, GatherProjectsModel model2) {
    // every community is given its own rank
    int communityValue1 = getCommunityValue(model1);
    int communityValue2 = getCommunityValue(model2);
    if(communityValue1 != communityValue2)
    {
        return communityValue1 > communityValue2 ? model1 : model2;
    }
    // same community: compare a representative popularity figure
    switch(communityValue1)
    {
        case 4:
            return pickByHotValue("oschina_project","star_num",model1,model2);
        case 3:
            return pickByHotValue("openhub_project","user_num",model1,model2);
        case 2:
            return pickByHotValue("sourceforge_project","download_num",model1,model2);
        case 1:
            return pickByHotValue("github","star_num",model1,model2);
        case 0:
            return pickByExtractedTime(model1,model2);
        default:
            return model1;
    }
}

/**
 * Compares two projects on the first value the platform DAO returns for
 * {@code column} in {@code table}. The extraction tables may hold several rows
 * for one project, so only the first value is used. A project without a value
 * loses; when neither has one, {@code model2} is returned.
 */
private GatherProjectsModel pickByHotValue(String table, String column, GatherProjectsModel model1, GatherProjectsModel model2) {
    Integer hot1 = firstValueOrNull(paltformDao.getHotValue(table, column, model1.getUrl_md5()));
    Integer hot2 = firstValueOrNull(paltformDao.getHotValue(table, column, model2.getUrl_md5()));
    if(hot1 != null && hot2 != null)
    {
        return hot1.intValue() > hot2.intValue() ? model1 : model2;
    }
    return hot1 == null ? model2 : model1;
}

/**
 * Compares two freecode projects on their extracted_time; the project with the
 * later timestamp is returned. A project without a timestamp loses; an
 * unparsable timestamp keeps {@code model1}.
 *
 * NOTE(review): the original comment said the earlier time is the more
 * trustworthy one, but the comparison has always returned the later one.
 * The comparison is left unchanged here; confirm which is intended.
 */
private GatherProjectsModel pickByExtractedTime(GatherProjectsModel model1, GatherProjectsModel model2) {
    String time1 = firstValueOrNull(paltformDao.getTime("gather_projects", "extracted_time", model1.getUrl_md5()));
    String time2 = firstValueOrNull(paltformDao.getTime("gather_projects", "extracted_time", model2.getUrl_md5()));
    if(time1 == null || time2 == null)
    {
        return time1 != null ? model1 : model2;
    }
    try
    {
        // local instance: SimpleDateFormat must not be shared between threads
        DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        Date dt1 = df.parse(time1);
        Date dt2 = df.parse(time2);
        return dt1.getTime() > dt2.getTime() ? model1 : model2;
    }
    catch(ParseException e)
    {
        logger.error("cannot compare extracted_time '" + time1 + "' and '" + time2 + "'", e);
        return model1;
    }
}

/** Returns the first element of {@code values}, or null when the list is null or empty. */
private static <T> T firstValueOrNull(List<T> values) {
    return (values == null || values.isEmpty()) ? null : values.get(0);
}
/**
 * Maps a project's source community to its rank: oschina 4, openhub 3,
 * sourceforge 2, github 1, freecode 0, and -1 for anything else (including a
 * missing source).
 */
private static int getCommunityValue(GatherProjectsModel model) {
    String source = model.getSource();
    // constant-first equals: a null source falls through to -1 instead of throwing
    if("oschina".equals(source))
        return 4;
    if("openhub".equals(source))
        return 3;
    if("sourceforge".equals(source))
        return 2;
    if("github".equals(source))
        return 1;
    if("freecode".equals(source))
        return 0;
    return -1;
}
public static void main(String[] args) {
ApplicationContext applicationContext = new ClassPathXmlApplicationContext(
@ -99,207 +281,517 @@ public class GetSynonyms{
}
/**
 * Extracts the aliases of a project, dispatching on its source community:
 * github projects go through getSynonymsFromGithubPrj, everything else
 * (including a project without a source) through getSynonymsNotGitPrj.
 *
 * @return alias -> confidence; may be null (the non-github path returns null
 *         for a project without description)
 */
public Map<String, Integer> getSynonyms(GatherProjectsModel model) {
    // constant-first equals so a missing source does not throw
    if("github".equals(model.getSource()))
    {
        return getSynonymsFromGithubPrj(model);
    }
    return getSynonymsNotGitPrj(model);
}
@SuppressWarnings("finally")
public Map<String, Integer> getSynonymsFromGithubPrj(GatherProjectsModel model)
{
Map<String, Integer> SynonymMap = new HashMap<String, Integer>();// 记录每个别名及其准确度<Sysnonym,1>表时Synonym可靠性很大
List<String> synonymsList = new ArrayList<String>();// 最终返回的同义词列表
List<String> synonymsList_notsure = new ArrayList<String>();
model.setName(model.getName().trim());
if(model.getDescription()!=null && !model.getDescription().equals(""))
if(model.getDescription() != null)
{
model.setDescription(model.getDescription().trim());
}
else
{
model.setDescription("");
}
//记录每个别名及其准确度<Sysnonym,1>表时Synonym可靠性很大
Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
//最终返回的可以确定正确的同义词列表
List<String> synonymsList = new ArrayList<String>();
String description = model.getDescription();
String prjName = model.getName();
try
{
//github项目名称中有斜杠提取斜杠后内容做为别名
String subName = getSubName(prjName);
if(subName != null)
{
SynonymMap.put(subName, 1);
}
//将subName中短横线下划线用空格代替以及将短横线下划线删除后获取的字符串当做别名
String strReplace = ReplaceSpecial(subName);
if(!strReplace.equals(subName))
{
synonymsList.add(strReplace);
}
//删除特殊符号如短横线下划线,如果其包含在description中则做为别名
String strDelete = DeleteSpecial(subName);
if(!strDelete.equals(subName))
{
if(description.toLowerCase().contains(strDelete.toLowerCase()))
{
synonymsList.add(strDelete);
}
}
if(description == null)
{
//如果描述信息为空别名就只包括已经提取出的和从名字中获取的别名
if(synonymsList != null)
for(String sure:synonymsList)
SynonymMap.put(sure, 1);
return SynonymMap;
}
//描述信息非空时尝试从description中提取别名
Map<String, Integer> synonymsFromDesc = getSynonymsFromDescOfGitprj(description,model.getName());
//将别名连同其确定性一起放入SynonymMap中
for(String tmpSyn:synonymsList)
SynonymMap.put(tmpSyn, 1);
//将从描述信息中获取的别名及其确定性放入SynonymMap
if(synonymsFromDesc != null)
for(String key:synonymsFromDesc.keySet())
SynonymMap.put(key, synonymsFromDesc.get(key));
return SynonymMap;
}catch(Exception e)
{
e.printStackTrace();
logger.error("Error while extracting synonyms from project:" + model.getId());
System.exit(0);
}finally
{
return SynonymMap;
}
}
private Map<String, Integer> getSynonymsNotGitPrj(GatherProjectsModel model) {
model.setName(model.getName().trim());
if(model.getDescription() != null)
model.setDescription(model.getDescription().trim());
List<String> synonymsList = new ArrayList<String>();//能够确定正确的别名
List<String> synonymsList_notsure = new ArrayList<String>();//不能百分百确定的别名
String description = model.getDescription();
String prjName = model.getName();
List<String> result = null;
if (description == null) {
if(description == null)
{
//如果description为null则别名无需再做改动直接返回null
return null;
}
List<String> linkNameList = new ArrayList<String>();// 用于存储所有提取出来的关联项目名
List<String> keywords = FileReader.read("./files/language.txt");
// 对项目名进行处理,去除公司名或基金组织的信息&&去除括号没有公司没有括号则包含自己
String projectNameWithoutBracket = StringHandler_ProjectName
.removeBracket(prjName);
String projectNameWithoutComName = StringHandler_ProjectName
.getProjectWithoutComName(projectNameWithoutBracket);
// String projectNameWithoutComName =
// StringHandler_ProjectName.getProjectWithoutComName(prjName);
// projectNameWithoutComName =
// StringHandler_ProjectName.removeBracket(projectNameWithoutComName);
if (!"".equals(projectNameWithoutComName)) {
List<String> linkNameList = new ArrayList<String>();//用于存储所有提取出来的可能是别名的词
List<String> keywords = FileReader.read("./files/language.txt");//用于过滤类似于"Java","C++"这样的词
projectNameWithoutBracket = StringHandler_ProjectName.removeBracket(prjName);
projectNameWithoutComName = StringHandler_ProjectName.getProjectWithoutComName(projectNameWithoutBracket);
//提取项目名括号中的信息不是项目类型
bracket = StringHandler_ProjectName.getBracket(prjName);
//去除括号及括号内容后即可做为别名
if(!"".equals(projectNameWithoutComName))
{
synonymsList.add(projectNameWithoutComName);
// logger.info("gets synonyms:" + projectNameWithoutComName +
// " Without ComName and remove racket");
logger.info("gets synonyms:" + projectNameWithoutComName + " Without ComName and remove racket");
}
// 提取项目名括号中的信息不是项目类型
String bracket = StringHandler_ProjectName.getBracket(prjName);
if (bracket != null && !"".equals(bracket)) {
bracket = RegexHandler.extractEngDecChiAndDot(bracket);// 去除括号中信息的特殊符号
if (!RegexHandler.onlySpecialChar(bracket)) {
if (!keywords.contains(bracket.toLowerCase())) {// 括号中的信息不是语言类关键字词
if (StringHandler.isFullOrShort(bracket,
projectNameWithoutBracket)) {
synonymsList.add(bracket);
} else {
if (StringHandler.specialIsFullOrShort(bracket,
projectNameWithoutBracket))
try
{
//TODO 将对项目名称的处理过程独立为一个函数
getSynonymsFromName(synonymsList,linkNameList,keywords);
if(description != null)
getSynonymsFromDescOfNotGitprj(model,description,prjName,synonymsList,linkNameList,keywords);
//result包含去掉公司信息括号内的be动词之前的
result = removeDupStr(linkNameList);
//TODO 将从linkName中提取全/缩写独立出来
getSynonymsByFullOrShort(result,synonymsList);
//TODO 将从linkName中依据相似度提取别名的程序独立出来
getSynonymsBySimilarDegree(model.getName(),result,synonymsList_notsure,keywords,"notGitPrj");
}catch(Exception e)
{
logger.error("Error while extracting synonym from prj:" + model.getId() + ":" +e);
}
return afterExtract(synonymsList, synonymsList_notsure);
}
public void getSynonymsFromName(List<String> synonymsList,List<String> linkNameList,List<String> keywords)
{
if(bracket != null && projectNameWithoutBracket != null && projectNameWithoutComName != null)
{
// Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
if(!"".equals(bracket))
{
bracket = RegexHandler.extractEngDecChiAndDot(bracket);//去除括号中信息的特殊符号
if(!RegexHandler.onlySpecialChar(bracket))
{
if(!keywords.contains(bracket.toLowerCase()))//括号中的信息不是语言类关键字词
{
//如果括号中内容个项目名存在全/缩写关系
if(StringHandler.isFullOrShort(bracket, projectNameWithoutBracket))
{
synonymsList.add(bracket);
//SynonymMap.put(bracket, 1);
}
else
linkNameList.add(bracket);
{
if(StringHandler.specialIsFullOrShort(bracket, projectNameWithoutBracket))
synonymsList.add(bracket);
else linkNameList.add(bracket);
}
}
}
}
}
if (description != null) {
List<String> beWords = FileReader.read("./files/beVerb.txt");// 读取be动词表
description = StringHandler.getFirstSentence(description);
for (String beWord : beWords) {
String linkName = StringHandler.findLinkName(description,
beWord);// 按系动词表优先级进行匹配
if (linkName == null) {// 表示没有匹配到当前的系动词 进行下一个匹配
continue;
}
if (linkName.toLowerCase().equals(prjName.toLowerCase()))
continue;
// 特征短语去噪
List<String> linkNameResult = EDDHandler
.getMainAndViceDescriptionByBracket(linkName);// 提取主副描述信息(括号中)
linkNameResult = EDDHandler
.getMainAndViceDescriptionByKnowAs(linkNameResult);// 提取主副描述信息(known
// as+连词)
linkNameResult = EDDHandler.removeComma(linkNameResult);// 去除定语描述信息(逗号定语)
linkNameResult = EDDHandler
.removeDemonstrativeWords(linkNameResult);// 去除定语描述信息(定冠词The)
linkNameResult = EDDHandler
.removeArrtibutiveClause(linkNameResult);// 去除从句描述信息
linkNameResult = EDDHandler.removePronoun(linkNameResult);// 去除指示代词
linkNameResult = EDDHandler.removeProject(linkNameResult);// 去除项目常用词
linkNameResult = EDDHandler.removePPCombine(linkNameResult);// 去除指示代词和项目常用词两者的笛卡尔积
linkNameResult = RegexHandler
.extractEngDecChiAndDot(linkNameResult);// 提取项目中
linkNameResult = RegexHandler.removeOnlySpecial(linkNameResult);// 删除只有特殊字符的同义词
for (String name : linkNameResult) {
// logger.info("be 动词之前的有: "+name);
// be动词前不包括项目类型且和项目名不相同
if (!keywords.contains(name.toLowerCase().trim())
&& !name.toLowerCase().trim()
.equals(prjName.toLowerCase().trim())
&& !name.toLowerCase()
.trim()
.equals(projectNameWithoutComName
.toLowerCase().trim())) {
// 这里的prjName是完整的name
if (prjName.toLowerCase().indexOf(name.toLowerCase()) != -1) {
String withoutShortDash = RemoveShortDash(name);
if (bracket == null || bracket.equals(""))
continue;
if (name.toLowerCase()
.equals(bracket.toLowerCase())
|| name.toLowerCase().equals(
withoutShortDash))
synonymsList.add(name);
else {
int nameLength1 = withoutShortDash.split(" ").length;
int nameLength2 = projectNameWithoutBracket
.split(" ").length;
int nameLength = name.trim().split(" ").length;
double ratio1 = (nameLength1 / nameLength) * 1.0;
double ratio2 = (nameLength2 / nameLength) * 1.0;
if (nameLength1 > 1 && ratio1 > 0.6)
synonymsList.add(name);
else {
if (nameLength2 > 1 && ratio2 > 0.6)
synonymsList.add(name);
else
linkNameList.add(name);
}
}
// synonymsList.add(name);//问题出在这个地方如果从discription中提取出的词在名字中有出现就直接加入
// 会导致比如Ajax被识别为Ajax Implement的别名而实际上并不是这样的
// logger.info("gets synonyms:" + name +
// " from description before be verb");
} else
linkNameList.add(name);
}
}
}
}
// result包含去掉公司信息括号内的be动词之前的
result = removeDupStr(linkNameList);
// logger.info("result去掉公司信息、括号内的、be动词之前的;");
// for (int i = 0; i < result.size(); i++)
// System.out.println("result: " + result.get(i));
// 去除重复的
String fullName = StringHandler.getFullName(
projectNameWithoutComName.toLowerCase(), result);
if (!fullName.equals("")) {
synonymsList.add(fullName.toLowerCase());// 表示项目名确实是缩写 并提取到了项目全名
// logger.info("项目的全称为: "+fullName);
}
String shortName = StringHandler.getShortName(
projectNameWithoutComName.toLowerCase(), result);
if (!shortName.equals("")) {
synonymsList.add(shortName.toLowerCase());// 表示项目名确实是全称 并提取到了项目缩写
// logger.info("项目的简称为: "+shortName);
}
// 同义词要和项目名称有一定的相似度
for (int i = 0; i < result.size(); i++) {
}
public void getSynonymsBySimilarDegree(String Name,List<String> result,List<String> synonymsList_notsure,List<String> keywords,String prjSource)
{
//同义词要和项目名称有一定的相似度
for(int i = 0; i < result.size(); i++){
String extract = "";
String synonym = result.get(i);
String[] words = synonym.split(" ");// 按照空格进行分词
for (String word : words) {
if (model.getName().toLowerCase().indexOf(word.toLowerCase()) >= 0) {// 查看每个词在项目名中是否出现
String[] words = synonym.split(" ");//按照空格进行分词
for(String word:words){
if(Name.toLowerCase().indexOf(word.toLowerCase()) >= 0){//查看每个词在项目名中是否出现
extract += word + " ";
}
}
if (extract.length() != 0)
if(extract.length() !=0)
extract = extract.substring(0, extract.length() - 1);
// logger.info("extract : "+extract);
if (!"".equals(extract) && !keywords.contains(extract)) {
// 提取的别名不和项目名重复
if (!extract.toLowerCase().equals(prjName.toLowerCase().trim())
&& !extract
.toLowerCase()
.trim()
.equals(projectNameWithoutComName.toLowerCase()
.trim())) {
double similarDegree = SimilarDegree(extract,
projectNameWithoutComName);
// System.out.println("字符串间的相似度为: "+similarDegree);
if (similarDegree >= 0.6 && similarDegree < 1.0) {
// logger.info("from extract after similar "+extract);
//logger.info("extract : "+extract);
if(!"".equals(extract) && !keywords.contains(extract))
{
boolean AbleToCalcSimilar = false;
if(prjSource.equals("github"))
{
String strReplace = ReplaceSpecial(getSubName(Name));
if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(strReplace.toLowerCase().trim()))
AbleToCalcSimilar = true;
}
if(!prjSource.equals("notGitPrj"))
{
if(Name!= null && projectNameWithoutBracket != null)
if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim()))
AbleToCalcSimilar = true;
}
//提取的别名不和项目名重复
if(AbleToCalcSimilar)
{
double similarDegree = 0.0;
if(!prjSource.equals("github")) similarDegree= SimilarDegree(extract,projectNameWithoutComName);
if(prjSource.equals("github")) similarDegree = SimilarDegree(extract, Name);
if(similarDegree >= 0.6 && similarDegree <1.0)
{
logger.info("from extract after similar "+extract);
synonymsList_notsure.add(extract);
}
}
}
}
removeDupStr(synonymsList);// 去除同名列表中重复的
removeDupStr(synonymsList_notsure);
for (String s : synonymsList) {
// logger.info("finally synonyms:" + s + " by description");
}
/**
 * Adds the full form and/or the abbreviation of the current project name to
 * {@code synonymsList} when StringHandler finds one among {@code result}.
 * The project name used is the lower-cased projectNameWithoutComName field.
 */
public void getSynonymsByFullOrShort(List<String> result,List<String>synonymsList)
{
    // non-empty: the project name is an abbreviation and its full form was found
    String expanded = StringHandler.getFullName(projectNameWithoutComName.toLowerCase(), result);
    if(!expanded.isEmpty())
    {
        synonymsList.add(expanded.toLowerCase());
    }
    // non-empty: the project name is a full form and its abbreviation was found
    String abbreviated = StringHandler.getShortName(projectNameWithoutComName.toLowerCase(), result);
    if(!abbreviated.isEmpty())
    {
        synonymsList.add(abbreviated.toLowerCase());
    }
}
public void getSynonymsFromDescOfNotGitprj(GatherProjectsModel model,String description,String prjName,List<String> synonymsList,List<String> linkNameList,List<String> keywords)
{
for (String sure : synonymsList)
SynonymMap.put(sure, 1);
for (String Notsure : synonymsList_notsure)
SynonymMap.put(Notsure, 0);
return SynonymMap;
List<String> beWords = FileReader.read("./files/beVerb.txt");//读取be动词表
//TODO 将对description的分词处理部分独立出来
description = StringHandler.getFirstSentence(description);
for(String beWord:beWords){
String linkName = StringHandler.findLinkName(description, beWord);//按系动词表优先级进行匹配
if(linkName == null){//表示没有匹配到当前的系动词 进行下一个匹配
continue;
}
if(linkName.toLowerCase().equals(prjName.toLowerCase()))
continue;
//特征短语去噪
List<String> linkNameResult =getLinkNameResult(linkName);
handleLinkNameResult(model,linkNameResult,synonymsList,linkNameList,keywords);
}
}
/**
 * Extracts aliases of a github project from its description.
 *
 * Only the first sentence of the description is matched against the be-verb
 * list; if the last be-verb does not match either, the first 100 characters of
 * the whole description are used as the phrase instead. Each cleaned phrase is
 * then classified: equal to the repository name once separators are deleted,
 * or in a full/short-form relation with it -> sure alias; otherwise it becomes a
 * candidate for the similarity check.
 *
 * @param description project description, must not be null
 * @param prjName     project name as stored (the part after '/' is used)
 * @return alias -> confidence (1 = sure, 0 = not sure)
 */
public Map<String, Integer> getSynonymsFromDescOfGitprj(String description,String prjName)
{
    List<String> keywords = FileReader.read("./files/language.txt");
    List<String> beWords = FileReader.read("./files/beVerb.txt");
    // only the first sentence of the description is analysed
    String descriptionOfFirstSentence = StringHandler.getFirstSentence(description);
    // repository name, and its variants with separators replaced by spaces / deleted
    String subName = getSubName(prjName);
    String strReplace = ReplaceSpecial(subName);
    String strDelete = DeleteSpecial(subName);
    List<String> linkNameList = new ArrayList<String>();
    List<String> synonymsList = new ArrayList<String>();// aliases known to be correct
    List<String> synonymsList_not_sure = new ArrayList<String>();// aliases that only satisfy the extraction rules
    int length = beWords.size();
    for(int k=0;k<length;k++)
    {
        // be-verbs are tried in list order
        String linkName = StringHandler.findLinkName(descriptionOfFirstSentence, beWords.get(k));
        if(linkName == null)
        {
            if(k < length-1)
            {
                continue;
            }
            // last be-verb did not match either: fall back to the head of the description
            linkName = description.length()<100 ? description : description.substring(0, 100);
        }
        // a phrase identical to the repository name carries no information (common on github)
        if(linkName.toLowerCase().equals(subName.toLowerCase()))
        {
            continue;
        }
        List<String> linkNameResult = getLinkNameResult(linkName);
        if(linkNameResult == null)
        {
            continue;
        }
        for(String name:linkNameResult)
        {
            logger.info("be 动词之前的有: "+name);
            String normalized = name.toLowerCase().trim();
            // skip language keywords and the repository name itself
            if(keywords.contains(normalized) || normalized.equals(subName.toLowerCase().trim()))
            {
                continue;
            }
            if(DeleteSpecial(name).equals(strDelete))
            {
                synonymsList.add(name);
                continue;
            }
            // full form / abbreviation of the repository name
            String synonymsFromGitPrj = getSynonymByFullOrShortOfGitPrj(strReplace.toLowerCase(),strDelete.toLowerCase(),name.toLowerCase());
            if(synonymsFromGitPrj != null)
            {
                synonymsList.add(synonymsFromGitPrj);
            }
            else
            {
                linkNameList.add(name.toLowerCase());
            }
        }
    }
    List<String> result = removeDupStr(linkNameList);
    getSynonymsBySimilarDegree(subName, result, synonymsList_not_sure, keywords,"github");
    return afterExtract(synonymsList,synonymsList_not_sure);
}
/**
 * Runs the phrase found in front of a be-verb through the fixed chain of
 * EDDHandler / RegexHandler clean-up steps and returns the resulting phrases.
 * The steps are applied strictly in the order listed.
 */
public List<String> getLinkNameResult(String linkName)
{
    List<String> phrases;
    // split into main / secondary description: bracketed part, then "known as" + conjunction
    phrases = EDDHandler.getMainAndViceDescriptionByBracket(linkName);
    phrases = EDDHandler.getMainAndViceDescriptionByKnowAs(phrases);
    // strip attributive noise: comma attributives, the article "The", clauses
    phrases = EDDHandler.removeComma(phrases);
    phrases = EDDHandler.removeDemonstrativeWords(phrases);
    phrases = EDDHandler.removeArrtibutiveClause(phrases);
    // strip pronouns, common project words, and combinations of the two
    phrases = EDDHandler.removePronoun(phrases);
    phrases = EDDHandler.removeProject(phrases);
    phrases = EDDHandler.removePPCombine(phrases);
    // character-level extraction step, then drop entries made only of special characters
    phrases = RegexHandler.extractEngDecChiAndDot(phrases);
    phrases = RegexHandler.removeOnlySpecial(phrases);
    return phrases;
}
/**
 * Classifies the phrases extracted from a description.
 *
 * Language keywords and phrases equal to the project name (full, or without
 * company name) are dropped. A phrase that does not occur in the project name
 * goes to {@code linkNameList} for later checks. A phrase that does occur in the
 * name becomes a sure alias when it equals the bracket content or its own
 * dash-free form, or when one of the word-count ratios passes; otherwise it
 * also goes to {@code linkNameList}.
 */
public void handleLinkNameResult(GatherProjectsModel model,List<String> linkNameResult,List<String> synonymsList,List<String>linkNameList,List<String>keywords)
{
    for(String name:linkNameResult){
        logger.info("be 动词之前的有: "+name);
        String nameLower = name.toLowerCase();
        String nameKey = nameLower.trim();
        // phrase must not be a language keyword ...
        if(keywords.contains(nameKey))
        {
            continue;
        }
        // ... and must differ from the project name (model.getName() is the complete name)
        String prjNameLower = model.getName().toLowerCase();
        if(nameKey.equals(prjNameLower.trim()) || nameKey.equals(projectNameWithoutComName.toLowerCase().trim()))
        {
            continue;
        }
        if(prjNameLower.indexOf(nameLower) == -1)
        {
            linkNameList.add(name);
            continue;
        }
        String withoutShortDash = RemoveShortDash(name);
        // bracket is null when the project name carries no bracket; treat that as "no match"
        boolean sameAsBracket = bracket != null && nameLower.equals(bracket.toLowerCase());
        if(sameAsBracket || nameLower.equals(withoutShortDash))
        {
            synonymsList.add(name);
            continue;
        }
        int nameLength1 = withoutShortDash.split(" ").length;
        int nameLength2 = projectNameWithoutBracket.split(" ").length;
        int nameLength = name.trim().split(" ").length;
        // NOTE(review): both divisions are integer divisions, so each ratio is a whole
        // number and "> 0.6" effectively means ">= 1". Left unchanged because the
        // intended ratio (and its direction) cannot be confirmed from this file.
        double ratio1= (nameLength1/nameLength) * 1.0;
        double ratio2 =(nameLength2/nameLength) * 1.0;
        if((nameLength1 > 1 && ratio1 > 0.6) || (nameLength2 > 1 && ratio2 > 0.6))
        {
            synonymsList.add(name);
        }
        else
        {
            linkNameList.add(name);
        }
    }
}
/**
 * Returns the part of {@code prjName} after its first '/', or {@code prjName}
 * itself when it contains no slash (e.g. "owner/repo" -> "repo").
 */
private String getSubName(String prjName) {
    int slash = prjName.indexOf('/');
    if(slash == -1)
    {
        return prjName;
    }
    // +1 so the slash itself is not part of the result
    return prjName.substring(slash + 1);
}
/**
 * Returns {@code subName} with every underscore, hyphen, dot and slash removed.
 */
public static String DeleteSpecial(String subName) {
    StringBuffer kept = new StringBuffer(subName.length());
    for(char c : subName.toCharArray())
    {
        switch(c)
        {
            case '_':
            case '-':
            case '.':
            case '/':
                // separator character: leave it out
                break;
            default:
                kept.append(c);
        }
    }
    return kept.toString();
}
/**
 * Returns {@code subName} with every hyphen and underscore turned into a space.
 */
public static String ReplaceSpecial(String subName) {
    return subName.replace('-', ' ').replace('_', ' ');
}
/**
 * Combines the two alias lists into one alias -> confidence map: entries of
 * {@code synonymsList} get 1, entries of {@code synonymsList_not_sure} get 0.
 * Both lists are de-duplicated first; an alias present in both ends up with 0
 * because the not-sure list is written last.
 */
public Map<String,Integer>afterExtract(List<String> synonymsList,List<String>synonymsList_not_sure)
{
    Map<String,Integer> scored = new HashMap<String,Integer>();
    List<String> confirmed = (synonymsList == null) ? null : removeDupStr(synonymsList);
    if(confirmed != null)
    {
        for(String alias : confirmed)
        {
            scored.put(alias, 1);
        }
    }
    if(synonymsList_not_sure != null)
    {
        List<String> tentative = removeDupStr(synonymsList_not_sure);
        for(String alias : tentative)
        {
            scored.put(alias, 0);
        }
    }
    return scored;
}
/**
 * Returns {@code descWithoutBracketHandled} when, after its hyphens and
 * underscores are replaced by spaces and it is lower-cased, StringHandler
 * reports a full/short-form relation with the lower-cased {@code subName};
 * returns null otherwise.
 *
 * {@code subName2} is currently unused: the second check that compared the
 * separator-free forms is disabled.
 */
public String getSynonymByFullOrShortOfGitPrj(String subName,String subName2,String descWithoutBracketHandled)
{
    String candidate = ReplaceSpecial(descWithoutBracketHandled).toLowerCase();
    String repoName = subName.toLowerCase();
    boolean related = StringHandler.isFullOrShort(candidate, repoName)
            || StringHandler.specialIsFullOrShortForGitPrj(candidate, repoName);
    return related ? descWithoutBracketHandled : null;
}
public static String RemoveShortDash(String name) {
String withoutShortDash = "";
if (name.indexOf('-') > -1)

View File

@ -6,6 +6,7 @@ import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.apache.ibatis.annotations.Update;
import com.ossean.model.OpenSourceProject;
import com.ossean.model.Taggings;
@ -13,6 +14,12 @@ import com.ossean.model.Taggings;
public interface DBDest {
/**
 * Finds, in the synonym table, the ids of the projects that carry a given synonym.
 * Selects the distinct prjId values of rows whose synonyms column equals
 * {@code Synonym} and whose flag column equals {@code flag}.
 *
 * @param targetTable name of the synonym table; inserted through ${...}, which MyBatis
 *                    substitutes into the SQL text unescaped, so pass only trusted table names
 * @param Synonym     synonym text to look up
 * @param flag        flag value the matching rows must have
 * @return the distinct project ids found
 */
@Select("select distinct(prjId) from ${targetTable} where synonyms=#{Synonym} and flag = #{flag}")
public Set<Integer> selectSameSynonymPrj(@Param("targetTable")String targetTable, @Param("Synonym")String Synonym,@Param("flag") int flag);
/**
 * Updates the flag column in the synonym table for a given synonym.
 * Note the inequality in the WHERE clause: the rows changed are those whose
 * synonyms column equals {@code synonym} and whose prjId is DIFFERENT from {@code prjId}.
 *
 * @param targetTable name of the synonym table; inserted through ${...}, which MyBatis
 *                    substitutes into the SQL text unescaped, so pass only trusted table names
 * @param synonym     synonym text identifying the rows
 * @param prjId       id of the project whose rows are left untouched
 * @param flag        new flag value
 */
@Update("update ${targetTable} set flag = #{flag} where synonyms = #{synonym} and prjId != #{prjId}")
public void updateSynoymmings(@Param("targetTable") String targetTable,@Param("synonym") String synonym,@Param("prjId") int prjId,@Param("flag")int flag);
//删除open_source_projects表中对应id的数据
@Delete("delete from ${table} where id=#{id}")

View File

@ -19,6 +19,10 @@ import com.ossean.model.Tags;
import com.ossean.model.UpdateControlProjects;
public interface DBSource {
/**
 * Reads one row from the gather_projects table (or whichever table is named) by its id.
 *
 * @param table name of the table to read; inserted through ${...}, which MyBatis
 *              substitutes into the SQL text unescaped, so pass only trusted table names
 * @param id    value of the id column to match
 * @return the matching row mapped to a GatherProjectsModel
 */
@Select("select * from ${table} where id=#{id}")
public GatherProjectsModel getGatherProjectById(@Param("table") String table, @Param("id") int id);
/**
 * Reads the Pointer column from the pointer table to see how much has been processed
 * for a given source/target table pair.
 * NOTE(review): the return type is a primitive int; confirm that a row always exists
 * for the requested pair.
 *
 * @param table           name of the pointer table; inserted through ${...}, which MyBatis
 *                        substitutes into the SQL text unescaped, so pass only trusted table names
 * @param SourceTableName value matched against the SourceTableName column
 * @param TargetTableName value matched against the TargetTableName column
 * @return the stored Pointer value
 */
@Select("select Pointer from ${table} where SourceTableName=#{SourceTableName} and TargetTableName=#{TargetTableName}")
public int getPointer(@Param("table") String table, @Param("SourceTableName") String SourceTableName, @Param("TargetTableName") String TargetTableName);

View File

@ -1,5 +1,7 @@
package com.ossean.dao2;
import java.util.List;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
@ -22,4 +24,10 @@ public interface PlatformProjectDao {
// Looks up a project in the github table by its url. The query selects id, short_name,
// name, stargazers_count and forks, groups by url_md5 and orders by extracted_time descending.
@Select("select id,short_name,name,stargazers_count,forks from github where url = #{url} group by url_md5 order by extracted_time desc")
public GithubProject getGithubPrjByUrl(@Param("url") String url);
/**
 * Fetches creation times so that the credibility of projects can be compared.
 * Selects the created_time column of the rows in {@code table} whose url_md5 matches.
 * NOTE(review): the {@code created_time} parameter is bound but never referenced by the
 * SQL, which hard-codes the column name; confirm whether ${created_time} was intended.
 *
 * @param table        name of the table to query; inserted through ${...}, which MyBatis
 *                     substitutes into the SQL text unescaped, so pass only trusted table names
 * @param created_time not used by the statement as written
 * @param url_md5      value matched against the url_md5 column
 * @return the created_time values of the matching rows
 */
@Select("select created_time from ${table} where url_md5=#{url_md5}")
public List<String> getTime(@Param("table") String table, @Param("created_time") String created_time,@Param("url_md5") String url_md5);
/**
 * Fetches a popularity metric (star_num, download_num and the like) used to judge how
 * hot each project is.
 * Both the column and the table are inserted through ${...}, which MyBatis substitutes
 * into the SQL text unescaped, so pass only trusted identifiers.
 *
 * @param table    name of the table to query
 * @param star_num name of the column to select
 * @param url_md5  value matched against the url_md5 column
 * @return the selected column's values for the matching rows
 */
@Select("select ${star_num} from ${table} where url_md5=#{url_md5}")
public List<Integer> getHotValue(@Param("table") String table, @Param("star_num") String star_num,@Param("url_md5") String url_md5);
}

View File

@ -50,7 +50,6 @@ public class StringHandler {
}
}
resultStrArr = tmp.toArray(new String[0]);
//System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]);
return resultStrArr;
}
@ -289,13 +288,18 @@ public class StringHandler {
}
public static boolean getFullName(String shortName, String synonym,int a){
//查看当前的同义词是不是含不含有全称
String[] words = synonym.toLowerCase().split(" ");
String shortenWord = "";
for(int i = 0; i < words.length; i++){
String word = words[i];
String tmp = word.substring(0, 1);//取出第一个英文字符
String tmp = "";
if(word.length() == 1)
tmp = word;
else
if(word.length() > 0)
tmp = word.substring(0, 1);//取出第一个英文字符
shortenWord += tmp;
}
@ -358,26 +362,22 @@ public class StringHandler {
public static boolean specialIsFullOrShort(String shortName, String fullName)
{
//System.out.println("进入了特殊别名提取函数");
//首先获取两个字符串中单词的
String[] words_fullName= fullName.toLowerCase().split(" ");
String[] words_shortName = shortName.toLowerCase().split(" ");
if(words_fullName.length==1 && words_shortName.length==1)
{
//System.out.println("两者长度都大于1");
return false;
}
if(words_shortName.length == 1 && words_fullName.length > 1 )
{
//System.out.println("fullName的长度大于1");
if(specialIsFullOrShorDetail(shortName,fullName))
return true;
}
else
{
//System.out.println("shortName长度大于1");
if(specialIsFullOrShorDetail(fullName,shortName))
return true;
}
@ -397,10 +397,15 @@ public class StringHandler {
//要求全写中各个单词的首字母必须在缩写中出现而且是顺序出现或最多有一个不出现在缩写中
for(int i = 0; i < words_fullName.length; i++){
String word = words_fullName[i].toLowerCase();
String tmp = word.substring(0, 1);//取出第一个英文字符
String tmp = "";//取出第一个英文字符
if(word.length() == 1)
tmp = word;
else
if(word.length() > 0)
tmp = word.substring(0, 1);//取出第一个英文字符
if(shortName.toLowerCase().contains(tmp))
{
//System.out.println("进入了详细字符串比对函数");
// tmpFlag = shortName.indexOf(tmp);
// if(tmpFlag > positionFlag)
// {
@ -412,11 +417,10 @@ public class StringHandler {
/**
如果满足全写中各个单词的首字母都在缩写中顺序出现就接着
判断缩写中所有字幕是否都在写中出现
判断缩写中所有字幕是否都在写中出现
*/
if(matchCount > words_fullName.length-2)
{
//System.out.println("匹配个数与全写中单词个数相等");
if(isAllContained(shortName,fullName))
return true;
}
@ -540,4 +544,50 @@ public class StringHandler {
return shortName;//表示描述信息中出现了缩写形式
return "";
}
/**
 * Variant of the full-name/abbreviation check used for git project names.
 * Returns false early, without running the detailed comparison, when:
 * more than two tokens of {@code fullName} (split on '_', '-' or, if it has neither,
 * on spaces) occur inside {@code shortName}; when either string contains the other;
 * or when both strings consist of a single space-separated word.
 * Otherwise the decision is delegated to {@code specialIsFullOrShorDetail}, with the
 * single-word string passed as the abbreviation.
 *
 * @param shortName first text to compare
 * @param fullName  second text to compare; tokenised on its own separator
 * @return true when the detailed comparison accepts the pair, false otherwise
 */
public static boolean specialIsFullOrShortForGitPrj(String shortName, String fullName)
{
    // Split the name on the separator it actually contains. The previous code used
    // "-" for names containing "_" and "_" for names containing "-", and a missing
    // else let the second choice overwrite the first, so names were never tokenised.
    String separatorRegex;
    if (fullName.indexOf('_') > -1 || fullName.indexOf('-') > -1)
        separatorRegex = "[_-]";
    else
        separatorRegex = " ";

    // Count the tokens of fullName that appear somewhere in shortName.
    int containedNum = 0;
    for (String token : fullName.split(separatorRegex))
    {
        // Empty tokens (leading or doubled separators) are skipped: contains("") is
        // always true and would inflate the count.
        if (!token.isEmpty() && shortName.contains(token))
            containedNum++;
    }
    if (containedNum > 2)
        return false;

    if (fullName.contains(shortName) || shortName.contains(fullName))
        return false;

    // Work out how many space-separated words each side has.
    String[] wordsFullName = fullName.toLowerCase().split(" ");
    String[] wordsShortName = shortName.toLowerCase().split(" ");
    if (wordsFullName.length == 1 && wordsShortName.length == 1)
        return false;

    if (wordsShortName.length == 1 && wordsFullName.length > 1)
        return specialIsFullOrShorDetail(shortName, fullName);
    // shortName has several words, so the roles are swapped for the detailed check.
    return specialIsFullOrShorDetail(fullName, shortName);
}
}