From 187a25fe4d29f4bc8161f9d0fbe8b267463cd83f Mon Sep 17 00:00:00 2001 From: kevin <1141706615@qq.com> Date: Wed, 22 Mar 2017 17:09:30 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=88=AB=E5=90=8D=E6=8F=90?= =?UTF-8?q?=E5=8F=96=E4=B8=BA=E5=A4=9A=E7=BA=BF=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../resources/applicationContext_mybatis.xml | 2 +- .../bin/start_get_synonyms_mulitiprocess.sh | 16 + .../src/main/java/com/ossean/AppContext.java | 9 + .../java/com/ossean/GetSynonymsProcess.java | 82 ++ .../java/com/ossean/GetSynonymsThread.java | 986 ++++++++++++++++++ .../main/java/com/ossean/dao/GatherDao.java | 6 + 6 files changed, 1100 insertions(+), 1 deletion(-) create mode 100644 project_match/bin/start_get_synonyms_mulitiprocess.sh create mode 100644 project_match/src/main/java/com/ossean/AppContext.java create mode 100644 project_match/src/main/java/com/ossean/GetSynonymsProcess.java create mode 100644 project_match/src/main/java/com/ossean/GetSynonymsThread.java diff --git a/project_match/bin/resources/applicationContext_mybatis.xml b/project_match/bin/resources/applicationContext_mybatis.xml index 8a978e0..6cc415e 100644 --- a/project_match/bin/resources/applicationContext_mybatis.xml +++ b/project_match/bin/resources/applicationContext_mybatis.xml @@ -52,7 +52,7 @@ - + diff --git a/project_match/bin/start_get_synonyms_mulitiprocess.sh b/project_match/bin/start_get_synonyms_mulitiprocess.sh new file mode 100644 index 0000000..af6c689 --- /dev/null +++ b/project_match/bin/start_get_synonyms_mulitiprocess.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +find ./target/classes -name "*.properties"|xargs rm -f +find ./target/classes -name "*.xml"|xargs rm -f +find ./target/classes -name "*.dic"|xargs rm -f + +#export CLASSPATH=$CURR_DIR/lib:$CURR_DIR:$JAVA_HOME/lib:$JAVA_HOME/jre/lib + +tmp='./target/classes':$tmp +tmp='./target/Project_Match-0.0.1-SNAPSHOT-jar-with-dependencies-without-resources/*':$tmp +tmp='./bin/resources':$tmp +CLASSPATH=$tmp:$CLASSPATH + + +echo $CLASSPATH +java -classpath $CLASSPATH com.ossean.GetSynonymsProcess >>log/getsynonyms_2017.log 2>&1 & diff --git a/project_match/src/main/java/com/ossean/AppContext.java b/project_match/src/main/java/com/ossean/AppContext.java new file mode 100644 index 0000000..dc5e4e8 --- /dev/null +++ b/project_match/src/main/java/com/ossean/AppContext.java @@ -0,0 +1,9 @@ +package com.ossean; +import org.springframework.context.ApplicationContext; +import org.springframework.context.support.ClassPathXmlApplicationContext; + +public class AppContext { + + static ApplicationContext appContext = new ClassPathXmlApplicationContext( + "classpath:/applicationContext*.xml"); +} diff --git a/project_match/src/main/java/com/ossean/GetSynonymsProcess.java b/project_match/src/main/java/com/ossean/GetSynonymsProcess.java new file mode 100644 index 0000000..6186448 --- /dev/null +++ b/project_match/src/main/java/com/ossean/GetSynonymsProcess.java @@ -0,0 +1,82 @@ +package com.ossean; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import javax.annotation.Resource; + +import org.apache.log4j.Logger; +import org.springframework.context.ApplicationContext; +import org.springframework.context.support.ClassPathXmlApplicationContext; +import org.springframework.stereotype.Component; +import org.springframework.stereotype.Component; + +import com.ossean.dao.DBDest; +import com.ossean.dao.DBSource; +import com.ossean.dao.GatherDao; +import com.ossean.dao2.PlatformProjectDao; + +@Component +public class GetSynonymsProcess { + + Logger logger = Logger.getLogger(this.getClass()); + @Resource + private DBSource dbSource; + @Resource + public DBDest dbDest; + @Resource + private GatherDao gatherDao; + @Resource + private PlatformProjectDao paltformDao; + //创建进程池,一次20个进程 + private ExecutorService pool = Executors.newFixedThreadPool(20); + + private static String gatherProjectsTableName ="gather_projects_test";//TableName.gatherProjectsTableName; + private static String synonymsTableName = TableName.synonymsTableName; + private static String pointerTableName = TableName.pointerTableName; + private static int batchSize = 10000; + + public void start() throws InterruptedException + { + int lastId = gatherDao.selectLastId(gatherProjectsTableName); + int maxId = gatherDao.getMaxId(gatherProjectsTableName); + System.out.println("lastId:" + lastId + ",maxId:" + maxId); + Thread.sleep(1000); + + int begin = lastId; + int end; + while(begin < maxId) + { + end = begin + batchSize -1; + if(end < maxId) + { + GetSynonymsThread getSynonymsThread = (GetSynonymsThread)AppContext.appContext.getBean("GetSynonymsThread"); + getSynonymsThread.setBorder(begin, end); + getSynonymsThread.setSourceTable(gatherProjectsTableName); + //getSynonymsThread.run(); + pool.execute(getSynonymsThread); + begin = begin + batchSize; + } + else + { + end = maxId; + GetSynonymsThread getSynonymsThread = (GetSynonymsThread)AppContext.appContext.getBean("GetSynonymsThread"); + getSynonymsThread.setBorder(begin, end); + getSynonymsThread.setSourceTable(gatherProjectsTableName); + pool.execute(getSynonymsThread); + //getSynonymsThread.run(); + break; + } + } + + + } + + public static void main(String[] args) throws InterruptedException { + // TODO Auto-generated method stub + ApplicationContext applicationContext = new ClassPathXmlApplicationContext( + "classpath:/applicationContext*.xml"); + GetSynonymsProcess Main = applicationContext.getBean(GetSynonymsProcess.class); + Main.start(); + } +} diff --git a/project_match/src/main/java/com/ossean/GetSynonymsThread.java b/project_match/src/main/java/com/ossean/GetSynonymsThread.java new file mode 100644 index 0000000..ddd269a --- /dev/null +++ b/project_match/src/main/java/com/ossean/GetSynonymsThread.java @@ -0,0 +1,986 @@ +package com.ossean; + +import java.text.DateFormat; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import javax.annotation.Resource; + +import org.apache.log4j.Logger; +import org.springframework.context.ApplicationContext; +import org.springframework.context.annotation.Scope; +import org.springframework.context.support.ClassPathXmlApplicationContext; +import org.springframework.stereotype.Component; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + +import com.ossean.dao.DBDest; +import com.ossean.dao.DBSource; +import com.ossean.dao.GatherDao; +import com.ossean.dao2.PlatformProjectDao; +import com.ossean.model.GatherProjectsModel; +import com.ossean.util.EDDHandler; +import com.ossean.util.FileReader; +import com.ossean.util.RegexHandler; +import com.ossean.util.StringHandler; +import com.ossean.util.StringHandler_ProjectName; + +@Component("GetSynonymsThread") +@Scope("prototype") +public class GetSynonymsThread implements Runnable { + + Logger logger = Logger.getLogger(this.getClass()); + @Resource + private DBSource dbSource; + @Resource + public DBDest dbDest; + @Resource + private GatherDao gatherDao; + @Resource + private PlatformProjectDao paltformDao; + + private static String gatherProjectsTableName ="gather_031922";//TableName.gatherProjectsTableName; + private static String synonymsTableName = TableName.synonymsTableName; + private static String pointerTableName = TableName.pointerTableName; + private static int batchSize = 1000; + + + //为了很好的将别名提取的程序结构化,特设以下三个全局变量在从项目名称中提取别名时使用 + String projectNameWithoutBracket = "";//项目名去除括号 + String projectNameWithoutComName = "";//项目名称去除括号和公司、基金名字 + String bracket = "";//括号中内容 + private int ids_begin; + private int ids_end; + private String sourceTable; + public void setBorder(int ids_begin,int ids_end) + { + this.ids_begin = ids_begin; + this.ids_end = ids_end; + } + public void setSourceTable(String sourceTable) + { + this.sourceTable = sourceTable; + } + + int beginId; + int endId; + int idIncrement = 1000; + @Override + public void run() + { + System.out.println("beginId:" + this.ids_begin + ",endId:" + this.ids_end); + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + logger.info("进程:" + sourceTable + ":" + this.ids_begin + "to:" + this.ids_end + "开始运行!"); + + while(ids_begin < ids_end) + {System.out.println("*************************************************************"); + beginId = ids_begin; + endId = beginId + idIncrement -1; + + if(endId <= ids_end) + { + batchGetSynonyms(beginId,endId); + ids_begin = ids_begin + idIncrement; + } + else + { + endId = ids_end; + batchGetSynonyms(beginId,endId); + break; + } + + + } + + } + + private void batchGetSynonyms(int beginId2, int endId2) { + // TODO Auto-generated method stub + List gpmList = gatherDao.selectbatchPrj(beginId,endId,sourceTable); + for(GatherProjectsModel model : gpmList) + { + Map synonymMap = getSynonyms(model); + synonymMap = synonymsFilter(synonymMap,synonymsTableName,model); + updateTable(synonymMap,model); + + } + + } + @Transactional(propagation=Propagation.REQUIRED) + public void updateTable(Map synonymsList,GatherProjectsModel model){ + if(synonymsList !=null && synonymsList.size()!=0) + for (String name : synonymsList.keySet()) { + dbSource.insertSynonymsNew(synonymsTableName, + model.getId(), name, synonymsList.get(name)); + } + // 更新update_mark=1 + gatherDao.updateMark(gatherProjectsTableName, 1, model.getId()); + } + public Map synonymsFilter(Map synonymMap,String targetTable,GatherProjectsModel model) + { + try + { + Map copy = synonymMap; + if(copy != null) + for(String key:copy.keySet()) + { + if(copy.get(key) == 1) + { + GatherProjectsModel tmp = model; + //通过别名查找别名表中具有相同别名且flag为1的项目Id + Set dupSynonymPrj =getSameSynonymPrj(key,targetTable); + + //GitHub社区项目中存在A-B-C,A B C这种,需要把短横、下划线去掉后再次判别是否相同。 + String replSpecial = ReplaceSpecial(key); + if(!replSpecial.equals(key)) + { + Set dupSynonymsPrj2 = getSameSynonymPrj(replSpecial, targetTable); + if(dupSynonymsPrj2 != null) + { + if(dupSynonymPrj != null && dupSynonymsPrj2 != null) + dupSynonymPrj.addAll(dupSynonymsPrj2); + else + dupSynonymPrj = dupSynonymsPrj2; + + } + } + + if(dupSynonymPrj != null) + { + Iterator iterator = dupSynonymPrj.iterator(); + while(iterator.hasNext()) + { + GatherProjectsModel prj = dbSource.getGatherProjectById(gatherProjectsTableName ,iterator.next()); + //迭代求取最优项目,保证tmp始终代表最适合这一别名的项目 + if(prj != null) + tmp = getBetterPrj(tmp,prj); + } + } + + if(tmp.getId()==model.getId() && tmp != null) + { + //当前项目为最适合这一别名的项目,就更新别名表中其余项目的flag为0 + dbDest.updateSynoymmings(targetTable,key,model.getId(),0); + } + else + { + synonymMap.put(key, 0); + dbDest.updateSynoymmings(targetTable, key, tmp.getId(), 0); + } + } + + + } + + + return synonymMap; + } + catch(Exception e) + { + e.printStackTrace(); + return null; + } + + } + + public Set getSameSynonymPrj(String key,String targetTable) + { + try + { + Set dupSynonymPrj = new HashSet(); + dupSynonymPrj = dbDest.selectSameSynonymPrj(targetTable,key,1); + + return dupSynonymPrj; + }catch(Exception e) + { + e.printStackTrace(); + return null; + } + } + + /** + * 比较两个项目哪一个更适合某一别名 + * */ + private GatherProjectsModel getBetterPrj(GatherProjectsModel model1, GatherProjectsModel model2) { + try + { + + // TODO Auto-generated method stub + GatherProjectsModel betterPrj=null; + //不同的社区赋给不同的值 + int communityValue1 = getCommunityValue(model1); + int communityValue2 = getCommunityValue(model2); + //如果来自同一社区就从抽取数据库中查询其有代表意义的数据比较热度 + if(communityValue1 == communityValue2) + { + if(communityValue1 == -1) + return model1; + else + { + if(communityValue1 == 4) + { + /**oschina用star_num具有代表性, + **用List是因为抽取数据库中未经汇总,有相同项目存在,只需求出list取第一个值即可 + **/ + List hot1 = paltformDao.getHotValue("oschina_project","star_num",model1.getUrl_md5()); + List hot2 = paltformDao.getHotValue("oschina_project","star_num",model2.getUrl_md5()); + if(hot1.size()>0 && hot2.size()>0)return hot1.get(0) > hot2.get(0) ? model1:model2; + else + if(hot1.size() == 0) + return model2; + else + return model1; + + } + if(communityValue1 == 3) + { + List hot1 = paltformDao.getHotValue("openhub_project","user_num",model1.getUrl_md5()); + List hot2 = paltformDao.getHotValue("openhub_project","user_num",model2.getUrl_md5()); + if(hot1.size()>0 && hot2.size()>0)return hot1.get(0) > hot2.get(0) ? model1:model2; + else + if(hot1.size() == 0) + return model2; + else + return model1; + } + if(communityValue1 == 2) + { + List hot1 = paltformDao.getHotValue("sourceforge_project","download_num",model1.getUrl_md5()); + List hot2 = paltformDao.getHotValue("sourceforge_project","download_num",model2.getUrl_md5()); + if(hot1.size()>0 && hot2.size()>0) + return hot1.get(0) > hot2.get(0) ? model1:model2; + else + if(hot1.size() == 0) + return model2; + else + return model1; + } + if(communityValue1 == 1) + { + List hot1 = paltformDao.getHotValue("github_tmp","forks",model1.getUrl_md5()); + List hot2 = paltformDao.getHotValue("github_tmp","forks",model2.getUrl_md5()); + if(hot1.size()>0 && hot2.size()>0) + return hot1.get(0) > hot2.get(0) ? model1:model2; + else + if(hot1== null || hot1.size()==0) + return model2; + else + return model1; + } + + if(communityValue1 == 0) + { + //freecode项目抽取结果中只有创建时间可以利用,创建时间较早的认为比较可信 + DateFormat df1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + DateFormat df2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + List time1 = paltformDao.getTime("freecode_project", "extracted_time", model1.getUrl_md5()); + List time2 = paltformDao.getTime("freecode_project", "extracted_time",model2.getUrl_md5()); + if(time1!=null && time2!=null) + try { + Date dt1 = df1.parse(time1.get(0)); + Date dt2 = df2.parse(time2.get(0)); + if(dt1 !=null && dt2!=null) + { + if(dt1.getTime() > dt2.getTime()) + return model1; + else + return model2; + } + } catch (ParseException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + else + if(time1 != null) + return model1; + else + return model2; + + } + } + } + else + return communityValue1 > communityValue2 ? model1:model2; + + return betterPrj; + + } + catch(Exception e) + { + e.printStackTrace(); + return null; + } + } + /** + * 根据项目的不同社区,赋给其不同的社区热度值 + * */ + private static int getCommunityValue(GatherProjectsModel model) { + // TODO Auto-generated method stub + String source = model.getSource(); + + if(source.equals("oschina")) + return 4; + if(source.equals("openhub")) + return 3; + if(source.equals("sourceforge")) + return 2; + if(source.equals("github")) + return 1; + if(source.equals("freecode")) + return 0; + + return -1; + } + + public static void main(String[] args) { + ApplicationContext applicationContext = new ClassPathXmlApplicationContext( + "classpath:/applicationContext*.xml"); + GetSynonyms Main = applicationContext.getBean(GetSynonyms.class); + Main.start(); + + } + + public Map getSynonyms(GatherProjectsModel model) { + if(model.getSource().equals("github")) + { + return getSynonymsFromGithubPrj(model);//处理github项目 + } + else + { + return getSynonymsNotGitPrj(model);//处理非github的其他项目 + } + } + + @SuppressWarnings("finally") + public Map getSynonymsFromGithubPrj(GatherProjectsModel model) + { + + model.setName(model.getName().trim()); + if(model.getDescription() != null) + { + model.setDescription(model.getDescription().trim()); + } + else + { + model.setDescription(""); + } + + //记录每个别名及其准确度,表时Synonym可靠性很大 + Map SynonymMap = new HashMap(); + + //最终返回的可以确定正确的同义词列表 + List synonymsList = new ArrayList(); + + String description = model.getDescription(); + String prjName = model.getName().toLowerCase(); + + try + { + //github项目名称中有斜杠,提取斜杠后内容做为别名 + String subName = getSubName(prjName); + if(subName != null) + { + System.out.println("从名字中提取到:" + subName); + SynonymMap.put(subName, 1); + } + + + //将subName中短横线、下划线用空格代替以及将短横线、下划线删除后获取的字符串当做别名 + String strReplace = ReplaceSpecial(subName); + if(!strReplace.equals(subName)) + { + synonymsList.add(strReplace); + } + + //删除特殊符号如短横线、下划线,如果其包含在description中则做为别名 + String strDelete = DeleteSpecial(subName); + if(!strDelete.equals(subName)) + { + if(description.toLowerCase().contains(strDelete.toLowerCase())) + { + synonymsList.add(strDelete); + } + } + + + if(description == null) + { + //如果描述信息为空,别名就只包括已经提取出的和从名字中获取的别名 + if(synonymsList != null) + for(String sure:synonymsList) + SynonymMap.put(sure, 1); + + return SynonymMap; + } + + //描述信息非空时,尝试从description中提取别名 + Map synonymsFromDesc = getSynonymsFromDescOfGitprj(description,model.getName()); + + //将别名连同其确定性一起放入SynonymMap中 + for(String tmpSyn:synonymsList) + SynonymMap.put(tmpSyn, 1); + + //将从描述信息中获取的别名及其确定性放入SynonymMap + if(synonymsFromDesc != null) + for(String key:synonymsFromDesc.keySet()) + SynonymMap.put(key, synonymsFromDesc.get(key)); + + return SynonymMap; + }catch(Exception e) + { + e.printStackTrace(); + logger.error("Error while extracting synonyms from project:" + model.getId()); + System.exit(0); + }finally + { + return SynonymMap; + + } + + } + + private Map getSynonymsNotGitPrj(GatherProjectsModel model) { + model.setName(model.getName().trim()); + if(model.getDescription() != null) + model.setDescription(model.getDescription().trim()); + + List synonymsList = new ArrayList();//能够确定正确的别名 + List synonymsList_notsure = new ArrayList();//不能百分百确定的别名 + + String description = model.getDescription(); + String prjName = model.getName(); + + List result = null; + if(description == null) + { + //如果description为null,则别名无需再做改动直接返回null + return null; + } + + List linkNameList = new ArrayList();//用于存储所有提取出来的可能是别名的词 + + List keywords = FileReader.read("./files/language.txt");//用于过滤类似于"Java","C++"这样的词 + projectNameWithoutBracket = StringHandler_ProjectName.removeBracket(prjName); + projectNameWithoutComName = StringHandler_ProjectName.getProjectWithoutComName(projectNameWithoutBracket); + //提取项目名括号中的信息,不是项目类型 + bracket = StringHandler_ProjectName.getBracket(prjName); + + //去除括号及括号内容后即可做为别名 + if(!"".equals(projectNameWithoutComName)) + { + synonymsList.add(projectNameWithoutComName); + logger.info("gets synonyms:" + projectNameWithoutComName + " Without ComName and remove racket"); + } + + + + try{ + //TODO 将对项目名称的处理过程独立为一个函数 + getSynonymsFromName(synonymsList,linkNameList,keywords); + }catch(Exception e) + { + System.out.println("1" + e); + System.exit(0); + } + + try{ + if(description != null) + getSynonymsFromDescOfNotGitprj(model,description,prjName,synonymsList,linkNameList,keywords); + }catch(Exception e) + { + System.out.println("2"+ model.getId()); + System.exit(0); + } + + + + try{ + //result包含去掉公司信息、括号内的、be动词之前的 + result = removeDupStr(linkNameList); + }catch(Exception e) + { + System.out.println("3"); + System.exit(0); + } + + try{ + //TODO 将从linkName中提取全/缩写独立出来 + getSynonymsByFullOrShort(result,synonymsList); + }catch(Exception e) + { + System.out.println("4"); + System.exit(0); + } + //TODO 将从linkName中依据相似度提取别名的程序独立出来 + getSynonymsBySimilarDegree(model.getName(),result,synonymsList_notsure,keywords,"notGitPrj"); + + + return afterExtract(synonymsList, synonymsList_notsure); + } + + public void getSynonymsFromName(List synonymsList,List linkNameList,List keywords) + { + if(bracket != null && projectNameWithoutBracket != null && projectNameWithoutComName != null) + { + // Map SynonymMap = new HashMap(); + if(!"".equals(bracket) && !projectNameWithoutBracket.equals("") && !projectNameWithoutComName.equals("")) + { + bracket = RegexHandler.extractEngDecChiAndDot(bracket);//去除括号中信息的特殊符号 + if(!RegexHandler.onlySpecialChar(bracket)) + { + if(!keywords.contains(bracket.toLowerCase()))//括号中的信息不是语言类关键字词 + { + //如果括号中内容个项目名存在全/缩写关系 + if(StringHandler.isFullOrShort(bracket, projectNameWithoutBracket)) + { + synonymsList.add(bracket); + //SynonymMap.put(bracket, 1); + } + else + { + if(StringHandler.specialIsFullOrShort(bracket, projectNameWithoutBracket)) + synonymsList.add(bracket); + else linkNameList.add(bracket); + } + + } + } + } + } + } + + public void getSynonymsBySimilarDegree(String Name,List result,List synonymsList_notsure,List keywords,String prjSource) + { + //同义词要和项目名称有一定的相似度 + for(int i = 0; i < result.size(); i++){ + String extract = ""; + String synonym = result.get(i); + String[] words = synonym.split(" ");//按照空格进行分词 + for(String word:words){ + if(Name.toLowerCase().indexOf(word.toLowerCase()) >= 0){//查看每个词在项目名中是否出现 + extract += word + " "; + } + } + if(extract.length() !=0) + extract = extract.substring(0, extract.length() - 1); + //logger.info("extract : "+extract); + if(!"".equals(extract) && !keywords.contains(extract)) + { + boolean AbleToCalcSimilar = false; + if(prjSource.equals("github")) + { + String strReplace = ReplaceSpecial(getSubName(Name)); + if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(strReplace.toLowerCase().trim())) + AbleToCalcSimilar = true; + } + if(!prjSource.equals("notGitPrj")) + { + if(Name!= null && projectNameWithoutBracket != null) + if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim())) + AbleToCalcSimilar = true; + } + //提取的别名不和项目名重复 + if(AbleToCalcSimilar) + { + double similarDegree = 0.0; + if(!prjSource.equals("github")) similarDegree= SimilarDegree(extract,projectNameWithoutComName); + if(prjSource.equals("github")) similarDegree = SimilarDegree(extract, Name); + if(similarDegree >= 0.6 && similarDegree <1.0) + { + logger.info("from extract after similar: "+extract); + synonymsList_notsure.add(extract); + } + } + } + } + } + + public void getSynonymsByFullOrShort(List result,ListsynonymsList) + { + String fullName = StringHandler.getFullName(projectNameWithoutComName.toLowerCase(), result); + if(!fullName.equals("")){ + synonymsList.add(fullName.toLowerCase());//表示项目名确实是缩写 并提取到了项目全名 + } + + String shortName = StringHandler.getShortName(projectNameWithoutComName.toLowerCase(), result); + if(!shortName.equals("")){ + synonymsList.add(shortName.toLowerCase());//表示项目名确实是全称 并提取到了项目缩写 + } + } + + public void getSynonymsFromDescOfNotGitprj(GatherProjectsModel model,String description,String prjName,List synonymsList,List linkNameList,List keywords) + { + + + List beWords = FileReader.read("./files/beVerb.txt");//读取be动词表 + //TODO 将对description的分词处理部分独立出来 + description = StringHandler.getFirstSentence(description); + System.out.println("get first sentnce"); + for(String beWord:beWords){ + + String linkName = StringHandler.findLinkName(description, beWord);//按系动词表优先级进行匹配 + if(linkName == null){//表示没有匹配到当前的系动词 进行下一个匹配 + continue; + } + if(linkName.toLowerCase().equals(prjName.toLowerCase())) + continue; + //特征短语去噪 + List linkNameResult =getLinkNameResult(linkName); + + System.out.println("error :while beword:" + beWord); + handleLinkNameResult(model,linkNameResult,synonymsList,linkNameList,keywords); + System.out.println("it seems to be ok"); + } + System.out.println("over"); + + } + public Map getSynonymsFromDescOfGitprj(String description,String prjName) + { + List keywords = FileReader.read("./files/language.txt"); + List beWords = FileReader.read("./files/beVerb.txt"); + //程序中只使用description的第一句进行处理。 + String descriptionOfFirstSentence = StringHandler.getFirstSentence(description); + //获取斜杠后名字 + String subName = getSubName(prjName); + + //将subName中的特殊符号用空格代替 + String strReplace = ReplaceSpecial(subName); + //将subName中的特殊符号直接删除 + String strDelete = DeleteSpecial(subName); + List linkNameList = new ArrayList(); + List synonymsList = new ArrayList();//确定正确的别名 + List synonymsList_not_sure = new ArrayList();//不确定是否正确但是满足提取规则的别名 + +// int matchBewords = 0;//指示description中是否有be动词,为0则没有匹配到任何be动词,大于1则说明有匹配到be动词 + List linkNameResult = new ArrayList();//存放description分词后的处理结果 + int length = beWords.size(); + for(int k=0;kresult = removeDupStr(linkNameList); + getSynonymsBySimilarDegree(subName, result, synonymsList_not_sure, keywords,"github"); + return afterExtract(synonymsList,synonymsList_not_sure); + } + + public List getLinkNameResult(String linkName) + { + //特征短语去噪 + List linkNameResult = EDDHandler.getMainAndViceDescriptionByBracket(linkName);//提取主副描述信息(括号中) + linkNameResult = EDDHandler.getMainAndViceDescriptionByKnowAs(linkNameResult);//提取主副描述信息(known as+连词) + linkNameResult = EDDHandler.removeComma(linkNameResult);//去除定语描述信息(逗号定语) + linkNameResult = EDDHandler.removeDemonstrativeWords(linkNameResult);//去除定语描述信息(定冠词The) + linkNameResult = EDDHandler.removeArrtibutiveClause(linkNameResult);//去除从句描述信息 + linkNameResult = EDDHandler.removePronoun(linkNameResult);//去除指示代词 + linkNameResult = EDDHandler.removeProject(linkNameResult);//去除项目常用词 + linkNameResult = EDDHandler.removePPCombine(linkNameResult);//去除指示代词和项目常用词两者的笛卡尔积 + linkNameResult = RegexHandler.extractEngDecChiAndDot(linkNameResult);//提取项目中 + linkNameResult = RegexHandler.removeOnlySpecial(linkNameResult);//删除只有特殊字符的同义词 + + return linkNameResult; + } + + public void handleLinkNameResult(GatherProjectsModel model,List linkNameResult,List synonymsList,ListlinkNameList,Listkeywords) + { + for(String name:linkNameResult){ + logger.info("be 动词之前的有: "+name); + //be动词前不包括项目类型,且和项目名不相同 + if(!keywords.contains(name.toLowerCase().trim()) && !name.toLowerCase().trim().equals(model.getName().toLowerCase().trim()) + && !name.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim())){ + System.out.println("FUCK1"); + //这里的prjName是完整的name + if(model.getName().toLowerCase().indexOf(name.toLowerCase()) != -1){ + System.out.println("FUCK2"); + String withoutShortDash = RemoveShortDash(name); + System.out.println("FUCK2_2"); + if(bracket!=null && !bracket.equals("")) + if(name.toLowerCase().equals(bracket.toLowerCase()) || name.toLowerCase().equals(withoutShortDash)) + { + System.out.println("FUCK_2_1"); + synonymsList.add(name); + } + else + { + System.out.println("FUCK2_3"); + int nameLength1 = withoutShortDash.split(" ").length; + int nameLength2 = projectNameWithoutBracket.split(" ").length; + int nameLength = name.trim().split(" ").length; + double ratio1= (nameLength1/nameLength) * 1.0; + double ratio2 =(nameLength2/nameLength) * 1.0; + if(nameLength1 > 1 && ratio1 > 0.6) + synonymsList.add(name); + else + { + if(nameLength2 > 1 && ratio2 > 0.6) + synonymsList.add(name); + else linkNameList.add(name); + } + System.out.println("FUCK3"); + } + } + else + linkNameList.add(name); + } + + } + } + + //取斜杠后面的字符串,如果没有斜杠就返回字符串本身 + private String getSubName(String prjName) { + // TODO Auto-generated method stub + String SubName = prjName; + if(prjName.indexOf('/') == -1) + return SubName; + else + SubName = prjName.substring(prjName.indexOf('/') + 1); + return SubName; + } + + public static String DeleteSpecial(String subName) { + // TODO Auto-generated method stub + int deleteNum=0; + StringBuffer str=new StringBuffer(subName); + for(int i=0;i< subName.length();i++) + { + //去除下划线 + if(subName.charAt(i) == '_') + { + str.deleteCharAt(i-deleteNum); + deleteNum++; + + } + //去除短横线 + if(subName.charAt(i) == '-') + { + //System.out.println("字符位置在" + i + " and 字符是" + subName.charAt(i)); + str.deleteCharAt(i-deleteNum); + deleteNum++; + } + if(subName.charAt(i) == '.') + { + //System.out.println("字符位置在" + i + " and 字符是" + subName.charAt(i)); + str.deleteCharAt(i-deleteNum); + deleteNum++; + } + if(subName.charAt(i)== '/') + { + //System.out.println("字符位置在" + i + " and 字符是" + subName.charAt(i)); + str.deleteCharAt(i-deleteNum); + deleteNum++; + } + } + + return str.toString(); + + } + + + public static String ReplaceSpecial(String subName) { + // TODO Auto-generated method stub + StringBuffer str=new StringBuffer(subName); + for(int i=0;iafterExtract(List synonymsList,ListsynonymsList_not_sure) + { + Map SynonymMap = new HashMap(); + if(synonymsList != null) + { + synonymsList = removeDupStr(synonymsList); + if(synonymsList != null) + for(String sure:synonymsList) + { + SynonymMap.put(sure, 1); + } + } + + if(synonymsList_not_sure != null) + { + synonymsList_not_sure = removeDupStr(synonymsList_not_sure); + for(String Notsure:synonymsList_not_sure) + { + SynonymMap.put(Notsure, 0); + } + } + + return SynonymMap; + } + + public String getSynonymByFullOrShortOfGitPrj(String subName,String subName2,String descWithoutBracketHandled) + { + if(StringHandler.isFullOrShort(ReplaceSpecial(descWithoutBracketHandled).toLowerCase(), subName.toLowerCase()) || + StringHandler.specialIsFullOrShortForGitPrj(ReplaceSpecial(descWithoutBracketHandled).toLowerCase(), subName.toLowerCase())) + { + return descWithoutBracketHandled; + //System.out.println("ReplaceSpecial提取到别名:" + descWithoutBracketHandled); + } +// if(StringHandler.isFullOrShort(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()) || +// StringHandler.specialIsFullOrShortForGitPrj(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase())) +// { +// return descWithoutBracketHandled; +// //System.out.println("DeleteSpecial提取到别名:" + descWithoutBracketHandled); +// } + return null; + } + public static String RemoveShortDash(String name) { + String withoutShortDash = ""; + if (name.indexOf('-') > -1) + withoutShortDash = name.substring(0, name.indexOf('-')).trim(); + return withoutShortDash; + } + + public List removeDupStr(List array) { + List list = new ArrayList(); + Set set = new HashSet(); + for (String s : array) { + if (set.add(s.toLowerCase())) { + list.add(s); + } + } + return list; + } + + public static double SimilarDegree(String strA, String strB) { + + String newStrA = removeSign(strA); + String newStrB = removeSign(strB); + int temp = Math.max(newStrA.length(), newStrB.length()); + int temp2 = longestCommonSubstring(newStrA, newStrB).length(); + return temp2 * 1.0 / temp; + + } + + private static String removeSign(String str) { + StringBuffer sb = new StringBuffer(); + for (char item : str.toCharArray()) + if (charReg(item)) { + sb.append(item); + } + return sb.toString(); + + } + + private static boolean charReg(char charValue) { + + return (charValue >= 0x4E00 && charValue <= 0X9FA5) + || (charValue >= 'a' && charValue <= 'z') + || (charValue >= 'A' && charValue <= 'Z') + || (charValue >= '0' && charValue <= '9'); + + } + + private static String longestCommonSubstring(String strA, String strB) { + char[] chars_strA = strA.toCharArray(); + char[] chars_strB = strB.toCharArray(); + int m = chars_strA.length; + int n = chars_strB.length; + int[][] matrix = new int[m + 1][n + 1]; + for (int i = 1; i <= m; i++) { + + for (int j = 1; j <= n; j++) { + if (chars_strA[i - 1] == chars_strB[j - 1]) + matrix[i][j] = matrix[i - 1][j - 1] + 1; + else + matrix[i][j] = Math.max(matrix[i][j - 1], matrix[i - 1][j]); + } + + } + + char[] result = new char[matrix[m][n]]; + int currentIndex = result.length - 1; + while (matrix[m][n] != 0) { + if (matrix[m][n] == matrix[m][n - 1]) + n--; + else if (matrix[m][n] == matrix[m - 1][n]) + m--; + else { + result[currentIndex] = chars_strA[m - 1]; + currentIndex--; + n--; + m--; + } + } + return new String(result); + + } +} diff --git a/project_match/src/main/java/com/ossean/dao/GatherDao.java b/project_match/src/main/java/com/ossean/dao/GatherDao.java index b6d711f..41fd996 100644 --- a/project_match/src/main/java/com/ossean/dao/GatherDao.java +++ b/project_match/src/main/java/com/ossean/dao/GatherDao.java @@ -13,6 +13,12 @@ import com.ossean.model.OpenhubUrl; import com.ossean.model.GatherProjectsModel; public interface GatherDao { + @Select("select * from ${sourceTable} where id >= #{beginId} and id <= #{endId} and update_mark = 0;") + public List selectbatchPrj(@Param("beginId")int beginId, @Param("endId")int endId,@Param("sourceTable") String sourceTable); + @Select("select max(id) from ${table}") + public int getMaxId(@Param("table") String table); + @Select("select min(id) from ${gatherProjectsTableName} where update_mark = 0") + public int selectLastId(@Param("gatherProjectsTableName")String gatherProjectsTableName); //tag表Insert操作 @Insert("insert ignore into ${table} (name) values (#{name})")