修改别名提取为多线程

This commit is contained in:
kevin 2017-03-22 17:09:30 +08:00
parent 649a6363a7
commit 187a25fe4d
6 changed files with 1100 additions and 1 deletions

View File

@ -52,7 +52,7 @@
<property name="sqlSessionFactory" ref="sqlSessionFactoryTwo"></property>
</bean>
<tx:annotation-driven transaction-manager="transactionManager"/>
<tx:annotation-driven transaction-manager="transactionManager" proxy-target-class="true"/>
<bean id="transactionManager" class="org.springframework.jdbc.datasource.DataSourceTransactionManager">
<property name="dataSource" ref="dataSourceOne" />
</bean>

View File

@ -0,0 +1,16 @@
#!/bin/bash
# Launches com.ossean.GetSynonymsProcess in the background and appends its
# output to log/getsynonyms_2017.log.

# Delete configuration/dictionary files from the compiled classes directory.
# Presumably this makes the copies under ./bin/resources (first on the
# classpath below) the ones that get loaded -- confirm.
# -exec ... {} + copes with file names containing spaces or quotes, which
# the former "find | xargs rm" pipeline did not.
find ./target/classes -type f \( -name "*.properties" -o -name "*.xml" -o -name "*.dic" \) -exec rm -f {} +

#export CLASSPATH=$CURR_DIR/lib:$CURR_DIR:$JAVA_HOME/lib:$JAVA_HOME/jre/lib

# Entries are listed in lookup order: external resources, dependency jars,
# then the project's own classes.
# NOTE(review): the original script built this from an unset $tmp and thus
# left an empty entry after ./target/classes; it is kept here as an explicit
# "." on the assumption that the JVM treated the empty entry as the current
# directory -- confirm and drop if not needed.
APP_CLASSPATH='./bin/resources'
APP_CLASSPATH="$APP_CLASSPATH"':./target/Project_Match-0.0.1-SNAPSHOT-jar-with-dependencies-without-resources/*'
APP_CLASSPATH="$APP_CLASSPATH"':./target/classes:.'

# Append an inherited CLASSPATH only when one is set, so no empty entry appears.
if [ -n "$CLASSPATH" ]; then
    APP_CLASSPATH="$APP_CLASSPATH:$CLASSPATH"
fi
CLASSPATH="$APP_CLASSPATH"

# Quote the value: it contains a '*' that must reach java unexpanded.
echo "$CLASSPATH"

# The redirect below fails if the log directory is missing.
mkdir -p log
java -classpath "$CLASSPATH" com.ossean.GetSynonymsProcess >>log/getsynonyms_2017.log 2>&1 &

View File

@ -0,0 +1,9 @@
package com.ossean;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
/**
 * Holder for the Spring application context shared inside this package.
 * The context is built once, when this class is initialised, from every
 * {@code applicationContext*.xml} file found at the classpath root.
 */
public class AppContext {
    static ApplicationContext appContext;

    static {
        appContext = new ClassPathXmlApplicationContext(
                "classpath:/applicationContext*.xml");
    }
}

View File

@ -0,0 +1,82 @@
package com.ossean;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Component;
import com.ossean.dao.DBDest;
import com.ossean.dao.DBSource;
import com.ossean.dao.GatherDao;
import com.ossean.dao2.PlatformProjectDao;
/**
 * Entry point of the multi-threaded synonym extraction: splits the id range
 * of the gather-projects table into batches and hands each batch to a
 * {@link GetSynonymsThread} running on a fixed thread pool.
 */
@Component
public class GetSynonymsProcess {
    Logger logger = Logger.getLogger(this.getClass());
    @Resource
    private DBSource dbSource;
    @Resource
    public DBDest dbDest;
    @Resource
    private GatherDao gatherDao;
    @Resource
    private PlatformProjectDao paltformDao;
    // Fixed pool of 20 worker threads; each submitted task handles one id range.
    private ExecutorService pool = Executors.newFixedThreadPool(20);
    private static String gatherProjectsTableName = "gather_projects_test";//TableName.gatherProjectsTableName;
    private static String synonymsTableName = TableName.synonymsTableName;
    private static String pointerTableName = TableName.pointerTableName;
    // Number of consecutive ids given to one worker task.
    private static int batchSize = 10000;

    /**
     * Submits one worker task per {@code batchSize}-wide id range, covering
     * every id from the smallest unprocessed one up to and including the
     * table's maximum id.
     *
     * @throws InterruptedException if interrupted during the start-up pause
     */
    public void start() throws InterruptedException {
        int lastId = gatherDao.selectLastId(gatherProjectsTableName);
        int maxId = gatherDao.getMaxId(gatherProjectsTableName);
        System.out.println("lastId:" + lastId + ",maxId:" + maxId);
        Thread.sleep(1000);
        // "<=" (not "<"): when a batch boundary falls exactly on maxId, or only
        // the row with id == maxId is left, that last id still needs a task.
        for (int begin = lastId; begin <= maxId; begin += batchSize) {
            int end = Math.min(begin + batchSize - 1, maxId);
            submitRange(begin, end);
        }
    }

    /** Configures a fresh prototype worker for ids {@code begin..end} and queues it. */
    private void submitRange(int begin, int end) {
        GetSynonymsThread worker =
                (GetSynonymsThread) AppContext.appContext.getBean("GetSynonymsThread");
        worker.setBorder(begin, end);
        worker.setSourceTable(gatherProjectsTableName);
        pool.execute(worker);
    }

    public static void main(String[] args) throws InterruptedException {
        // Reuse the context the workers are created from instead of building a
        // second, identical one (which would instantiate every bean twice).
        GetSynonymsProcess process = AppContext.appContext.getBean(GetSynonymsProcess.class);
        process.start();
        // No further tasks will be submitted: let the pool threads end once the
        // queued ranges are done so they do not keep the JVM alive forever.
        process.pool.shutdown();
    }
}

View File

@ -0,0 +1,986 @@
package com.ossean;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.Scope;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
import com.ossean.dao.DBDest;
import com.ossean.dao.DBSource;
import com.ossean.dao.GatherDao;
import com.ossean.dao2.PlatformProjectDao;
import com.ossean.model.GatherProjectsModel;
import com.ossean.util.EDDHandler;
import com.ossean.util.FileReader;
import com.ossean.util.RegexHandler;
import com.ossean.util.StringHandler;
import com.ossean.util.StringHandler_ProjectName;
@Component("GetSynonymsThread")
@Scope("prototype")
public class GetSynonymsThread implements Runnable {
Logger logger = Logger.getLogger(this.getClass());
@Resource
private DBSource dbSource;
@Resource
public DBDest dbDest;
@Resource
private GatherDao gatherDao;
@Resource
private PlatformProjectDao paltformDao;
private static String gatherProjectsTableName ="gather_031922";//TableName.gatherProjectsTableName;
private static String synonymsTableName = TableName.synonymsTableName;
private static String pointerTableName = TableName.pointerTableName;
private static int batchSize = 1000;
//为了很好的将别名提取的程序结构化特设以下三个全局变量在从项目名称中提取别名时使用
String projectNameWithoutBracket = "";//项目名去除括号
String projectNameWithoutComName = "";//项目名称去除括号和公司基金名字
String bracket = "";//括号中内容
private int ids_begin;
private int ids_end;
private String sourceTable;
/**
 * Sets the inclusive id range this worker processes when {@link #run()} is called.
 *
 * @param ids_begin first id of the range
 * @param ids_end   last id of the range
 */
public void setBorder(int ids_begin, int ids_end) {
    this.ids_end = ids_end;
    this.ids_begin = ids_begin;
}
/**
 * Sets the name of the table this worker reads its projects from.
 *
 * @param sourceTable table name passed to the batch select
 */
public void setSourceTable(String sourceTable) {
    this.sourceTable = sourceTable;
}
int beginId;
int endId;
int idIncrement = 1000;
/**
 * Processes the configured id range {@code ids_begin..ids_end} (both
 * inclusive) in slices of {@code idIncrement} ids.
 */
@Override
public void run() {
    System.out.println("beginId:" + this.ids_begin + ",endId:" + this.ids_end);
    try {
        Thread.sleep(1000);
    } catch (InterruptedException e) {
        // Keep the interrupt visible to whoever owns this thread.
        Thread.currentThread().interrupt();
        logger.warn("Interrupted during start-up pause", e);
    }
    logger.info("进程:" + sourceTable + ":" + this.ids_begin + "to:" + this.ids_end + "开始运行!");
    // "<=" (not "<"): a range that consists of a single id (ids_begin ==
    // ids_end) must still be processed.
    while (ids_begin <= ids_end) {
        System.out.println("*************************************************************");
        beginId = ids_begin;
        endId = beginId + idIncrement - 1;
        if (endId <= ids_end) {
            batchGetSynonyms(beginId, endId);
            ids_begin = ids_begin + idIncrement;
        } else {
            // Last, shorter slice: clamp to the end of the range and stop.
            endId = ids_end;
            batchGetSynonyms(beginId, endId);
            break;
        }
    }
}
/**
 * Extracts, filters and stores the synonyms of every unprocessed project
 * whose id lies in {@code beginId2..endId2} of {@code sourceTable}.
 *
 * @param beginId2 first id of the slice (inclusive)
 * @param endId2   last id of the slice (inclusive)
 */
private void batchGetSynonyms(int beginId2, int endId2) {
    // Use the arguments; the previous code ignored them and read the
    // beginId/endId fields, which only worked because run() happened to set
    // those fields to the same values right before the call.
    List<GatherProjectsModel> gpmList = gatherDao.selectbatchPrj(beginId2, endId2, sourceTable);
    for (GatherProjectsModel model : gpmList) {
        Map<String, Integer> synonymMap = getSynonyms(model);
        synonymMap = synonymsFilter(synonymMap, synonymsTableName, model);
        updateTable(synonymMap, model);
    }
}
/**
 * Stores the extracted synonyms of one project and marks the project as
 * processed ({@code update_mark = 1}).
 *
 * NOTE(review): this method is invoked through {@code this} from
 * batchGetSynonyms, so proxy-based {@code @Transactional} advice is
 * presumably not applied to that call -- confirm.
 *
 * @param synonymsList synonym name -> reliability flag; may be null or empty,
 *                     in which case only the mark is updated
 * @param model        project the synonyms belong to
 */
@Transactional(propagation=Propagation.REQUIRED)
public void updateTable(Map<String, Integer> synonymsList, GatherProjectsModel model) {
    if (synonymsList != null) {
        for (Map.Entry<String, Integer> synonym : synonymsList.entrySet()) {
            dbSource.insertSynonymsNew(synonymsTableName,
                    model.getId(), synonym.getKey(), synonym.getValue());
        }
    }
    // Mark the row in the table it was read from. The hard-coded
    // gatherProjectsTableName is only a fallback for callers that never set a
    // source table; marking a different table would leave the processed rows
    // at update_mark = 0.
    String markTable = (sourceTable != null) ? sourceTable : gatherProjectsTableName;
    gatherDao.updateMark(markTable, 1, model.getId());
}
/**
 * Resolves conflicts for every reliable synonym (flag 1) of {@code model}:
 * other projects that already own the same synonym with flag 1 are looked
 * up, the most suitable project is chosen with getBetterPrj, and the
 * synonym table is updated through dbDest.updateSynoymmings. When another
 * project wins, this project's flag for that synonym is lowered to 0.
 *
 * @param synonymMap  synonym -> flag for {@code model}; may be null
 * @param targetTable synonym table to query and update
 * @param model       project currently being processed
 * @return the same map with flags adjusted, or null if {@code synonymMap}
 *         was null or an error occurred
 */
public Map<String,Integer> synonymsFilter(Map<String,Integer> synonymMap,String targetTable,GatherProjectsModel model)
{
    if (synonymMap == null) {
        return null;
    }
    try {
        for (Map.Entry<String, Integer> entry : synonymMap.entrySet()) {
            if (entry.getValue() != 1) {
                continue;
            }
            String key = entry.getKey();
            // Ids of projects that already hold this synonym with flag 1.
            Set<Integer> dupSynonymPrj = getSameSynonymPrj(key, targetTable);
            // GitHub names such as "A-B-C" and "A B C" count as the same
            // synonym, so also look up the variant with '-'/'_' replaced.
            String replSpecial = ReplaceSpecial(key);
            if (!replSpecial.equals(key)) {
                Set<Integer> dupSynonymsPrj2 = getSameSynonymPrj(replSpecial, targetTable);
                if (dupSynonymsPrj2 != null) {
                    if (dupSynonymPrj != null) {
                        dupSynonymPrj.addAll(dupSynonymsPrj2);
                    } else {
                        dupSynonymPrj = dupSynonymsPrj2;
                    }
                }
            }
            // best always refers to the most suitable project seen so far.
            GatherProjectsModel best = model;
            if (dupSynonymPrj != null) {
                for (Integer rivalId : dupSynonymPrj) {
                    GatherProjectsModel rival = dbSource.getGatherProjectById(gatherProjectsTableName, rivalId);
                    if (rival == null) {
                        continue;
                    }
                    GatherProjectsModel winner = getBetterPrj(best, rival);
                    // A failed comparison yields null; keep the current best
                    // instead of losing every synonym of this project to a
                    // NullPointerException further down.
                    if (winner != null) {
                        best = winner;
                    }
                }
            }
            // NOTE(review): compares ids with ==; correct for a primitive id,
            // would need equals() if getId() returns a boxed type -- confirm.
            if (best.getId() == model.getId()) {
                // Current project is the best owner of this synonym.
                dbDest.updateSynoymmings(targetTable, key, model.getId(), 0);
            } else {
                entry.setValue(0);
                dbDest.updateSynoymmings(targetTable, key, best.getId(), 0);
            }
        }
        return synonymMap;
    } catch (Exception e) {
        logger.error("Error while filtering synonyms of project:" + model.getId(), e);
        return null;
    }
}
/**
 * Looks up the ids of projects that own synonym {@code key} with flag 1 in
 * {@code targetTable}.
 *
 * @return the ids reported by the DAO, or null if the lookup threw
 */
public Set<Integer> getSameSynonymPrj(String key, String targetTable) {
    Set<Integer> owners;
    try {
        owners = dbDest.selectSameSynonymPrj(targetTable, key, 1);
    } catch (Exception e) {
        e.printStackTrace();
        owners = null;
    }
    return owners;
}
/**
 * Decides which of two projects is the better owner of a shared synonym.
 * Projects from different communities are ranked by getCommunityValue;
 * projects from the same community are ranked by a community-specific
 * popularity column (or by extracted_time for freecode).
 *
 * @param model1 current best candidate
 * @param model2 challenger
 * @return the preferred project; {@code model1} when both come from an
 *         unknown community or when the comparison fails
 */
private GatherProjectsModel getBetterPrj(GatherProjectsModel model1, GatherProjectsModel model2) {
    try {
        int communityValue1 = getCommunityValue(model1);
        int communityValue2 = getCommunityValue(model2);
        if (communityValue1 != communityValue2) {
            return communityValue1 > communityValue2 ? model1 : model2;
        }
        // Same community: compare a representative value from the
        // extraction database.
        switch (communityValue1) {
            case 4:
                return pickByHotValue("oschina_project", "star_num", model1, model2);
            case 3:
                return pickByHotValue("openhub_project", "user_num", model1, model2);
            case 2:
                return pickByHotValue("sourceforge_project", "download_num", model1, model2);
            case 1:
                return pickByHotValue("github_tmp", "forks", model1, model2);
            case 0:
                return pickByExtractedTime(model1, model2);
            default:
                return model1;
        }
    } catch (Exception e) {
        // Keep the incumbent rather than returning null, which the caller
        // would dereference.
        logger.error("Error while comparing projects " + model1.getId() + " and " + model2.getId(), e);
        return model1;
    }
}

/**
 * Compares two projects by the first value of {@code column} in
 * {@code table}. A list is used because the extraction tables may hold
 * several rows per project; only the first is considered. A project
 * without a value loses; if neither has one, {@code model2} is returned
 * (same as the previous inline code).
 */
private GatherProjectsModel pickByHotValue(String table, String column,
        GatherProjectsModel model1, GatherProjectsModel model2) {
    List<Integer> hot1 = paltformDao.getHotValue(table, column, model1.getUrl_md5());
    List<Integer> hot2 = paltformDao.getHotValue(table, column, model2.getUrl_md5());
    boolean has1 = hot1 != null && !hot1.isEmpty() && hot1.get(0) != null;
    boolean has2 = hot2 != null && !hot2.isEmpty() && hot2.get(0) != null;
    if (has1 && has2) {
        return hot1.get(0) > hot2.get(0) ? model1 : model2;
    }
    return has1 ? model1 : model2;
}

/**
 * Compares two freecode projects by their first extracted_time value.
 * NOTE(review): the original comment said the EARLIER time is the more
 * trustworthy one, but the code prefers the LATER time; the code's
 * behaviour is kept -- confirm which is intended.
 */
private GatherProjectsModel pickByExtractedTime(GatherProjectsModel model1, GatherProjectsModel model2) {
    List<String> time1 = paltformDao.getTime("freecode_project", "extracted_time", model1.getUrl_md5());
    List<String> time2 = paltformDao.getTime("freecode_project", "extracted_time", model2.getUrl_md5());
    boolean has1 = time1 != null && !time1.isEmpty() && time1.get(0) != null;
    boolean has2 = time2 != null && !time2.isEmpty() && time2.get(0) != null;
    if (!has1 || !has2) {
        return has1 ? model1 : model2;
    }
    // SimpleDateFormat is not thread-safe, so a fresh local instance is used.
    DateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    try {
        Date dt1 = format.parse(time1.get(0));
        Date dt2 = format.parse(time2.get(0));
        return dt1.getTime() > dt2.getTime() ? model1 : model2;
    } catch (ParseException e) {
        logger.error("Unparseable extracted_time for project " + model1.getId() + " or " + model2.getId(), e);
        return model1;
    }
}
/**
 * Maps the community a project comes from to its rank:
 * oschina 4, openhub 3, sourceforge 2, github 1, freecode 0.
 *
 * @return the rank, or -1 when the source is null or not one of the above
 */
private static int getCommunityValue(GatherProjectsModel model) {
    String source = model.getSource();
    // Literal-first equals() keeps a missing source from throwing.
    if ("oschina".equals(source))
        return 4;
    if ("openhub".equals(source))
        return 3;
    if ("sourceforge".equals(source))
        return 2;
    if ("github".equals(source))
        return 1;
    if ("freecode".equals(source))
        return 0;
    return -1;
}
/**
 * Stand-alone entry point: loads the Spring context and starts the
 * GetSynonyms bean.
 */
public static void main(String[] args) {
    ApplicationContext context =
            new ClassPathXmlApplicationContext("classpath:/applicationContext*.xml");
    GetSynonyms extractor = context.getBean(GetSynonyms.class);
    extractor.start();
}
/**
 * Extracts the synonyms of a project, using the GitHub-specific rules for
 * projects whose source is "github" and the generic rules otherwise.
 *
 * @return synonym -> reliability flag (1 = reliable, 0 = uncertain)
 */
public Map<String, Integer> getSynonyms(GatherProjectsModel model) {
    // Literal-first equals(): a project without a source is handled by the
    // generic rules instead of throwing.
    if ("github".equals(model.getSource())) {
        return getSynonymsFromGithubPrj(model);
    }
    return getSynonymsNotGitPrj(model);
}
/**
 * Extracts synonyms for a GitHub project from its name ("owner/repo") and
 * its description. The model's name and description are trimmed in place;
 * a null description is replaced by "".
 *
 * On an unexpected exception the error is logged and the JVM is terminated
 * (fail-fast, as before), now with a non-zero exit status.
 *
 * @return synonym -> reliability flag (1 = reliable, 0 = uncertain)
 */
public Map<String, Integer> getSynonymsFromGithubPrj(GatherProjectsModel model)
{
    model.setName(model.getName().trim());
    if (model.getDescription() != null) {
        model.setDescription(model.getDescription().trim());
    } else {
        model.setDescription("");
    }
    // Every synonym with its reliability; 1 means highly reliable.
    Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
    // Synonyms derived from the name that are considered certain.
    List<String> synonymsList = new ArrayList<String>();
    String description = model.getDescription();
    String prjName = model.getName().toLowerCase();
    try {
        // The part after the slash of a GitHub name is itself a synonym.
        String subName = getSubName(prjName);
        if (subName != null) {
            System.out.println("从名字中提取到:" + subName);
            SynonymMap.put(subName, 1);
        }
        // Variant with '-' and '_' replaced by spaces.
        String strReplace = ReplaceSpecial(subName);
        if (!strReplace.equals(subName)) {
            synonymsList.add(strReplace);
        }
        // Variant with the special characters removed; accepted only when it
        // literally occurs in the description.
        String strDelete = DeleteSpecial(subName);
        if (!strDelete.equals(subName)
                && description != null
                && description.toLowerCase().contains(strDelete.toLowerCase())) {
            synonymsList.add(strDelete);
        }
        for (String sure : synonymsList) {
            SynonymMap.put(sure, 1);
        }
        // Without a description only the name-derived synonyms are returned.
        if (description != null) {
            Map<String, Integer> synonymsFromDesc = getSynonymsFromDescOfGitprj(description, model.getName());
            if (synonymsFromDesc != null) {
                // Description-derived entries override name-derived ones.
                SynonymMap.putAll(synonymsFromDesc);
            }
        }
        return SynonymMap;
    } catch (Exception e) {
        // Log with the stack trace and exit with a failure status. The former
        // "return" inside "finally" is gone: it also swallowed Errors.
        logger.error("Error while extracting synonyms from project:" + model.getId(), e);
        System.exit(1);
        return SynonymMap; // not reached; required by the compiler
    }
}
/**
 * Extracts synonyms for a project that does not come from GitHub, using its
 * name (bracket content, company prefix removed) and the first sentence of
 * its description. The model's name and description are trimmed in place.
 *
 * Any failure in one of the extraction stages is logged with its stack
 * trace and terminates the JVM (fail-fast, as before) with a non-zero
 * exit status.
 *
 * NOTE(review): the keyword file is read again for every project;
 * consider caching it.
 *
 * @return synonym -> reliability flag, or null when the project has no
 *         description
 */
private Map<String, Integer> getSynonymsNotGitPrj(GatherProjectsModel model) {
    model.setName(model.getName().trim());
    if (model.getDescription() != null)
        model.setDescription(model.getDescription().trim());
    List<String> synonymsList = new ArrayList<String>();          // synonyms known to be correct
    List<String> synonymsList_notsure = new ArrayList<String>();  // synonyms that may be wrong
    String description = model.getDescription();
    String prjName = model.getName();
    List<String> result = null;
    if (description == null) {
        // No description: nothing is extracted at all.
        return null;
    }
    List<String> linkNameList = new ArrayList<String>();          // every candidate phrase found
    List<String> keywords = FileReader.read("./files/language.txt"); // filters words such as "Java", "C++"
    projectNameWithoutBracket = StringHandler_ProjectName.removeBracket(prjName);
    projectNameWithoutComName = StringHandler_ProjectName.getProjectWithoutComName(projectNameWithoutBracket);
    // Content of the bracket in the project name.
    bracket = StringHandler_ProjectName.getBracket(prjName);
    // The name without bracket and company prefix is a synonym by itself.
    if (!"".equals(projectNameWithoutComName)) {
        synonymsList.add(projectNameWithoutComName);
        logger.info("gets synonyms:" + projectNameWithoutComName + " Without ComName and remove racket");
    }
    try {
        getSynonymsFromName(synonymsList, linkNameList, keywords);
    } catch (Exception e) {
        System.out.println("1" + e);
        logger.error("Stage 1 (name) failed for project:" + model.getId(), e);
        System.exit(1);
    }
    try {
        getSynonymsFromDescOfNotGitprj(model, description, prjName, synonymsList, linkNameList, keywords);
    } catch (Exception e) {
        System.out.println("2" + model.getId());
        logger.error("Stage 2 (description) failed for project:" + model.getId(), e);
        System.exit(1);
    }
    try {
        // Candidates collected from the bracket and from before the be-verb.
        result = removeDupStr(linkNameList);
    } catch (Exception e) {
        System.out.println("3");
        logger.error("Stage 3 (de-duplication) failed for project:" + model.getId(), e);
        System.exit(1);
    }
    try {
        getSynonymsByFullOrShort(result, synonymsList);
    } catch (Exception e) {
        System.out.println("4");
        logger.error("Stage 4 (full/short name) failed for project:" + model.getId(), e);
        System.exit(1);
    }
    getSynonymsBySimilarDegree(model.getName(), result, synonymsList_notsure, keywords, "notGitPrj");
    return afterExtract(synonymsList, synonymsList_notsure);
}
/**
 * Classifies the bracket content of the project name. After cleaning, the
 * bracket text goes to {@code synonymsList} when it is a full/short form of
 * the name without bracket, otherwise to {@code linkNameList}. Nothing
 * happens when any of the three name fields is null or empty, when the
 * cleaned bracket holds only special characters, or when it is a keyword.
 */
public void getSynonymsFromName(List<String> synonymsList, List<String> linkNameList, List<String> keywords) {
    if (bracket == null || projectNameWithoutBracket == null || projectNameWithoutComName == null) {
        return;
    }
    if ("".equals(bracket) || projectNameWithoutBracket.equals("") || projectNameWithoutComName.equals("")) {
        return;
    }
    // Strip special symbols from the bracket content.
    bracket = RegexHandler.extractEngDecChiAndDot(bracket);
    if (RegexHandler.onlySpecialChar(bracket)) {
        return;
    }
    // Language-like keywords are never synonyms.
    if (keywords.contains(bracket.toLowerCase())) {
        return;
    }
    boolean isAbbreviationPair =
            StringHandler.isFullOrShort(bracket, projectNameWithoutBracket)
            || StringHandler.specialIsFullOrShort(bracket, projectNameWithoutBracket);
    List<String> destination = isAbbreviationPair ? synonymsList : linkNameList;
    destination.add(bracket);
}
/**
 * Derives uncertain synonyms from candidate phrases: for each phrase the
 * words that occur in the project name are kept, and the kept text is
 * added to {@code synonymsList_notsure} when its similarity to the
 * reference name is in [0.6, 1.0).
 *
 * @param Name      project name (the sub-name for GitHub projects)
 * @param result    candidate phrases
 * @param synonymsList_notsure receives the accepted candidates
 * @param keywords  words that are never synonyms
 * @param prjSource "github" or "notGitPrj"; any other value accepts nothing
 */
public void getSynonymsBySimilarDegree(String Name,List<String> result,List<String> synonymsList_notsure,List<String> keywords,String prjSource)
{
    boolean fromGithub = "github".equals(prjSource);
    boolean fromOther = "notGitPrj".equals(prjSource);
    for (String synonym : result) {
        // Keep only the words of the phrase that appear in the project name.
        StringBuilder kept = new StringBuilder();
        for (String word : synonym.split(" ")) {
            if (Name.toLowerCase().indexOf(word.toLowerCase()) >= 0) {
                kept.append(word).append(' ');
            }
        }
        String extract = kept.length() == 0 ? "" : kept.substring(0, kept.length() - 1);
        if ("".equals(extract) || keywords.contains(extract)) {
            continue;
        }
        String lowerExtract = extract.toLowerCase();
        boolean differsFromName = !lowerExtract.equals(Name.toLowerCase().trim());
        // The candidate must not merely repeat the project name.
        boolean AbleToCalcSimilar = false;
        String reference = null;
        if (fromGithub) {
            String strReplace = ReplaceSpecial(getSubName(Name));
            AbleToCalcSimilar = differsFromName
                    && !lowerExtract.trim().equals(strReplace.toLowerCase().trim());
            reference = Name;
        } else if (fromOther) {
            // This branch used to be guarded by !equals("notGitPrj"): it never
            // ran for non-GitHub projects, and for GitHub projects it compared
            // against name fields left over from an earlier project.
            if (projectNameWithoutBracket != null && projectNameWithoutComName != null) {
                AbleToCalcSimilar = differsFromName
                        && !lowerExtract.trim().equals(projectNameWithoutComName.toLowerCase().trim());
                reference = projectNameWithoutComName;
            }
        }
        if (!AbleToCalcSimilar) {
            continue;
        }
        double similarDegree = SimilarDegree(extract, reference);
        if (similarDegree >= 0.6 && similarDegree < 1.0) {
            logger.info("from extract after similar "+extract);
            synonymsList_notsure.add(extract);
        }
    }
}
/**
 * Adds the full form and/or the short form of the project name (without
 * company prefix) to {@code synonymsList} when StringHandler finds one
 * among the candidates in {@code result}. Both are stored in lower case.
 */
public void getSynonymsByFullOrShort(List<String> result, List<String> synonymsList) {
    // Non-empty means the project name is an abbreviation and its full form was found.
    String expanded = StringHandler.getFullName(projectNameWithoutComName.toLowerCase(), result);
    boolean foundFull = !expanded.equals("");
    if (foundFull) {
        synonymsList.add(expanded.toLowerCase());
    }

    // Non-empty means the project name is a full form and its abbreviation was found.
    String abbreviated = StringHandler.getShortName(projectNameWithoutComName.toLowerCase(), result);
    boolean foundShort = !abbreviated.equals("");
    if (foundShort) {
        synonymsList.add(abbreviated.toLowerCase());
    }
}
/**
 * Scans the first sentence of the description for each be-verb of the verb
 * list (in list order), cleans the phrase found for that verb and passes
 * the result to handleLinkNameResult. Phrases equal to the project name
 * (ignoring case) are skipped.
 */
public void getSynonymsFromDescOfNotGitprj(GatherProjectsModel model,String description,String prjName,List<String> synonymsList,List<String> linkNameList,List<String> keywords)
{
    List<String> beVerbs = FileReader.read("./files/beVerb.txt");
    description = StringHandler.getFirstSentence(description);
    System.out.println("get first sentnce");
    Iterator<String> verbs = beVerbs.iterator();
    while (verbs.hasNext()) {
        String beWord = verbs.next();
        String linkName = StringHandler.findLinkName(description, beWord);
        // null: this verb did not match; same-as-name: nothing new to learn.
        boolean usable = linkName != null
                && !linkName.toLowerCase().equals(prjName.toLowerCase());
        if (usable) {
            // Remove noise from the phrase.
            List<String> cleaned = getLinkNameResult(linkName);
            System.out.println("error :while beword:" + beWord);
            handleLinkNameResult(model, cleaned, synonymsList, linkNameList, keywords);
            System.out.println("it seems to be ok");
        }
    }
    System.out.println("over");
}
/**
 * Extracts synonyms of a GitHub project from its description.
 * For every be-verb of the verb list the matching phrase of the first
 * sentence is cleaned and each resulting name is classified as a certain
 * synonym, or kept as a candidate for the similarity check.
 *
 * @param description project description (must not be null)
 * @param prjName     project name, possibly in "owner/repo" form
 * @return synonym -> flag map built by afterExtract (1 certain, 0 uncertain)
 */
public Map<String, Integer> getSynonymsFromDescOfGitprj(String description,String prjName)
{
List<String> keywords = FileReader.read("./files/language.txt");
List<String> beWords = FileReader.read("./files/beVerb.txt");
// Only the first sentence of the description is processed
String descriptionOfFirstSentence = StringHandler.getFirstSentence(description);
// Name after the slash
String subName = getSubName(prjName);
// subName with '-' and '_' replaced by spaces
String strReplace = ReplaceSpecial(subName);
// subName with the special characters removed
String strDelete = DeleteSpecial(subName);
List<String> linkNameList = new ArrayList<String>();
List<String> synonymsList = new ArrayList<String>();// synonyms known to be correct
List<String> synonymsList_not_sure = new ArrayList<String>();// match the extraction rules but may be wrong
// int matchBewords = 0;// (disabled) would indicate whether any be-verb matched in the description
List<String> linkNameResult = new ArrayList<String>();// cleaned phrases taken from the description
int length = beWords.size();
for(int k=0;k<length;k++)
{
// Match the be-verbs in list (priority) order
String linkName = StringHandler.findLinkName(descriptionOfFirstSentence, beWords.get(k));
if(linkName == null)
{
if(k < length-1)
continue;
else
{
// The last be-verb did not match: fall back to the first 100
// characters of the whole description. Note this happens whether
// or not an earlier be-verb matched.
if(description.length()<100)
linkName = description;
else linkName = description.substring(0, 100);
}
}
// A phrase equal to subName brings nothing new (common on GitHub): skip it
if(linkName.toLowerCase().equals(subName.toLowerCase()))
continue;
// Clean the phrase that matched the be-verb
linkNameResult = getLinkNameResult(linkName);
if(linkNameResult != null)
{
// Concatenation of the processed names.
// NOTE(review): strFromDesc is built and trimmed but never read afterwards in this method.
String strFromDesc="";
for(String name:linkNameResult)
{
logger.info("be 动词之前的有: "+name);
if(!keywords.contains(name.toLowerCase().trim()) && !name.toLowerCase().trim().equals(subName.toLowerCase().trim()) )
{
// NOTE(review): the "!= null" test comes after the value was already dereferenced by equals()
if(DeleteSpecial(name).equals(strDelete) && DeleteSpecial(name)!= null)
{
System.out.println("1位置提取到别名" + name);
synonymsList.add(name);
}
else
{
// Try to recognise a full/short-form synonym
String synonymsFromGitPrj = getSynonymByFullOrShortOfGitPrj(strReplace.toLowerCase(),strDelete.toLowerCase(),name.toLowerCase());
if(synonymsFromGitPrj != null)
{
System.out.println("2位置提取到" + synonymsFromGitPrj);
synonymsList.add(synonymsFromGitPrj);
}
else
{
linkNameList.add(name.toLowerCase());
}
}
}
// Re-assemble the processed description into one string
strFromDesc = strFromDesc + name + " ";
}
strFromDesc = strFromDesc.trim();
}
}
// Remaining candidates are judged by their similarity to subName
List<String>result = removeDupStr(linkNameList);
getSynonymsBySimilarDegree(subName, result, synonymsList_not_sure, keywords,"github");
return afterExtract(synonymsList,synonymsList_not_sure);
}
/**
 * Runs a phrase through the fixed chain of EDDHandler / RegexHandler
 * clean-up steps and returns the resulting list of names. The steps are
 * applied strictly in the order shown.
 */
public List<String> getLinkNameResult(String linkName) {
    // Split into main / secondary description (bracket, then "known as" + conjunction).
    List<String> names = EDDHandler.getMainAndViceDescriptionByBracket(linkName);
    names = EDDHandler.getMainAndViceDescriptionByKnowAs(names);

    // Drop attributive parts: comma attributes, the article "The", clauses.
    names = EDDHandler.removeComma(names);
    names = EDDHandler.removeDemonstrativeWords(names);
    names = EDDHandler.removeArrtibutiveClause(names);

    // Drop pronouns, common project words, and combinations of the two.
    names = EDDHandler.removePronoun(names);
    names = EDDHandler.removeProject(names);
    names = EDDHandler.removePPCombine(names);

    // Final character-level clean-up; entries made only of special characters are removed.
    names = RegexHandler.extractEngDecChiAndDot(names);
    names = RegexHandler.removeOnlySpecial(names);
    return names;
}
/**
 * Classifies the cleaned names found in front of a be-verb. Names that are
 * keywords or equal to the project name are ignored. Names not contained in
 * the full project name become candidates ({@code linkNameList}). Contained
 * names become certain synonyms when they equal the bracket content or the
 * part before a dash, or when the word-count ratios exceed 0.6; otherwise
 * they become candidates.
 *
 * The temporary debug output that was written to stdout for every name has
 * been removed.
 */
public void handleLinkNameResult(GatherProjectsModel model,List<String> linkNameResult,List<String> synonymsList,List<String>linkNameList,List<String>keywords)
{
    for (String name : linkNameResult) {
        logger.info("be 动词之前的有: "+name);
        String nameLower = name.toLowerCase();
        String nameKey = nameLower.trim();
        // Skip project-type keywords and anything equal to the project name.
        if (keywords.contains(nameKey)
                || nameKey.equals(model.getName().toLowerCase().trim())
                || nameKey.equals(projectNameWithoutComName.toLowerCase().trim())) {
            continue;
        }
        // Compared against the complete project name.
        if (model.getName().toLowerCase().indexOf(nameLower) == -1) {
            linkNameList.add(name);
            continue;
        }
        String withoutShortDash = RemoveShortDash(name);
        if (bracket == null || bracket.equals("")) {
            // NOTE(review): without bracket content the name is dropped
            // entirely. This mirrors how the original un-braced if/else
            // bound; it may have been meant to fall through to the ratio
            // check below -- confirm.
            continue;
        }
        if (nameLower.equals(bracket.toLowerCase()) || nameLower.equals(withoutShortDash)) {
            synonymsList.add(name);
            continue;
        }
        int nameLength1 = withoutShortDash.split(" ").length;
        int nameLength2 = projectNameWithoutBracket.split(" ").length;
        int nameLength = name.trim().split(" ").length; // always >= 1
        // Floating-point division: the previous integer division truncated
        // every ratio to a whole number before the 0.6 comparison.
        double ratio1 = (double) nameLength1 / nameLength;
        double ratio2 = (double) nameLength2 / nameLength;
        if ((nameLength1 > 1 && ratio1 > 0.6) || (nameLength2 > 1 && ratio2 > 0.6)) {
            synonymsList.add(name);
        } else {
            linkNameList.add(name);
        }
    }
}
/**
 * Returns the text after the first '/' of {@code prjName}, or
 * {@code prjName} itself when it contains no slash.
 */
private String getSubName(String prjName) {
    int slashAt = prjName.indexOf('/');
    return slashAt == -1 ? prjName : prjName.substring(slashAt + 1);
}
/**
 * Returns {@code subName} with every underscore, hyphen, dot and slash
 * removed; all other characters are kept in order.
 */
public static String DeleteSpecial(String subName) {
    StringBuilder kept = new StringBuilder(subName.length());
    for (char c : subName.toCharArray()) {
        switch (c) {
            case '_':
            case '-':
            case '.':
            case '/':
                break; // dropped
            default:
                kept.append(c);
        }
    }
    return kept.toString();
}
/**
 * Returns {@code subName} with every hyphen and underscore replaced by a
 * single space; all other characters are unchanged.
 */
public static String ReplaceSpecial(String subName) {
    return subName.replace('-', ' ').replace('_', ' ');
}
/**
 * Combines certain and uncertain synonyms into one map. Certain synonyms
 * get flag 1, uncertain ones flag 0; each list is de-duplicated first
 * (ignoring case). A synonym present in both lists ends up with flag 0
 * because the uncertain list is written last.
 */
public Map<String,Integer>afterExtract(List<String> synonymsList,List<String>synonymsList_not_sure)
{
    Map<String,Integer> flagged = new HashMap<String,Integer>();
    if (synonymsList != null) {
        List<String> certain = removeDupStr(synonymsList);
        if (certain != null) {
            for (int i = 0; i < certain.size(); i++) {
                flagged.put(certain.get(i), 1);
            }
        }
    }
    if (synonymsList_not_sure != null) {
        List<String> uncertain = removeDupStr(synonymsList_not_sure);
        for (int i = 0; i < uncertain.size(); i++) {
            flagged.put(uncertain.get(i), 0);
        }
    }
    return flagged;
}
/**
 * Returns {@code descWithoutBracketHandled} when, after replacing '-'/'_'
 * by spaces and lower-casing, it is a full/short form of {@code subName}
 * according to StringHandler; otherwise null.
 * {@code subName2} is currently unused: the variant of the check that
 * worked on the special-character-free name is disabled.
 */
public String getSynonymByFullOrShortOfGitPrj(String subName,String subName2,String descWithoutBracketHandled)
{
    String candidate = ReplaceSpecial(descWithoutBracketHandled).toLowerCase();
    String projectName = subName.toLowerCase();
    if (StringHandler.isFullOrShort(candidate, projectName)) {
        return descWithoutBracketHandled;
    }
    if (StringHandler.specialIsFullOrShortForGitPrj(candidate, projectName)) {
        return descWithoutBracketHandled;
    }
    return null;
}
/**
 * Returns the trimmed text in front of the first hyphen of {@code name},
 * or the empty string when {@code name} contains no hyphen.
 */
public static String RemoveShortDash(String name) {
    int dashAt = name.indexOf('-');
    if (dashAt < 0) {
        return "";
    }
    return name.substring(0, dashAt).trim();
}
/**
 * Returns a new list with the elements of {@code array} in their original
 * order, keeping only the first of any elements that are equal when
 * compared in lower case.
 */
public List<String> removeDupStr(List<String> array) {
    Set<String> seenLowerCase = new HashSet<String>();
    List<String> unique = new ArrayList<String>();
    Iterator<String> items = array.iterator();
    while (items.hasNext()) {
        String candidate = items.next();
        boolean firstOccurrence = seenLowerCase.add(candidate.toLowerCase());
        if (!firstOccurrence) {
            continue;
        }
        unique.add(candidate);
    }
    return unique;
}
/**
 * Similarity of two strings in [0, 1]: the length of their longest common
 * subsequence divided by the length of the longer string, both measured
 * after everything except letters, digits and CJK ideographs is removed.
 *
 * @return the ratio, or 0.0 when neither string contains such a character
 *         (previously this case produced NaN through 0/0)
 */
public static double SimilarDegree(String strA, String strB) {
    String newStrA = removeSign(strA);
    String newStrB = removeSign(strB);
    int longer = Math.max(newStrA.length(), newStrB.length());
    if (longer == 0) {
        return 0.0;
    }
    int common = longestCommonSubstring(newStrA, newStrB).length();
    return common * 1.0 / longer;
}

/** Returns {@code str} reduced to the characters accepted by charReg. */
private static String removeSign(String str) {
    StringBuilder sb = new StringBuilder();
    for (char item : str.toCharArray()) {
        if (charReg(item)) {
            sb.append(item);
        }
    }
    return sb.toString();
}

/** True for CJK ideographs (U+4E00..U+9FA5), ASCII letters and ASCII digits. */
private static boolean charReg(char charValue) {
    return (charValue >= 0x4E00 && charValue <= 0X9FA5)
            || (charValue >= 'a' && charValue <= 'z')
            || (charValue >= 'A' && charValue <= 'Z')
            || (charValue >= '0' && charValue <= '9');
}

/**
 * Returns a longest common subsequence of the two strings (despite the
 * method name the matched characters need not be contiguous), computed with
 * the classic dynamic-programming table and a backward walk.
 */
private static String longestCommonSubstring(String strA, String strB) {
    char[] chars_strA = strA.toCharArray();
    char[] chars_strB = strB.toCharArray();
    int m = chars_strA.length;
    int n = chars_strB.length;
    // matrix[i][j] = LCS length of the first i chars of A and first j chars of B.
    int[][] matrix = new int[m + 1][n + 1];
    for (int i = 1; i <= m; i++) {
        for (int j = 1; j <= n; j++) {
            if (chars_strA[i - 1] == chars_strB[j - 1])
                matrix[i][j] = matrix[i - 1][j - 1] + 1;
            else
                matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]);
        }
    }
    // Walk back from the bottom-right corner, filling the result from its end.
    char[] result = new char[matrix[m][n]];
    int currentIndex = result.length - 1;
    while (matrix[m][n] != 0) {
        if (matrix[m][n] == matrix[m][n - 1])
            n--;
        else if (matrix[m][n] == matrix[m - 1][n])
            m--;
        else {
            result[currentIndex] = chars_strA[m - 1];
            currentIndex--;
            n--;
            m--;
        }
    }
    return new String(result);
}
}

View File

@ -13,6 +13,12 @@ import com.ossean.model.OpenhubUrl;
import com.ossean.model.GatherProjectsModel;
public interface GatherDao {
/**
 * Selects the rows of {@code sourceTable} whose id lies in
 * {@code beginId..endId} (both inclusive) and whose update_mark is 0.
 * NOTE(review): {@code ${sourceTable}} is presumably MyBatis text
 * substitution, i.e. the table name is inserted into the SQL as-is and
 * must never come from untrusted input -- confirm.
 */
@Select("select * from ${sourceTable} where id >= #{beginId} and id <= #{endId} and update_mark = 0;")
public List<GatherProjectsModel> selectbatchPrj(@Param("beginId")int beginId, @Param("endId")int endId,@Param("sourceTable") String sourceTable);
/**
 * Returns the largest id in {@code table}.
 * NOTE(review): max(id) is NULL for an empty table, which presumably
 * cannot be mapped to the primitive int return type -- confirm how callers
 * should handle an empty table.
 */
@Select("select max(id) from ${table}")
public int getMaxId(@Param("table") String table);
/**
 * Returns the smallest id among the rows of the given table whose
 * update_mark is 0, i.e. the first row still to be processed.
 * NOTE(review): min(id) is NULL when no such row exists, which presumably
 * cannot be mapped to the primitive int return type -- confirm.
 */
@Select("select min(id) from ${gatherProjectsTableName} where update_mark = 0")
public int selectLastId(@Param("gatherProjectsTableName")String gatherProjectsTableName);
//tag表Insert操作
@Insert("insert ignore into ${table} (name) values (#{name})")