gather_program bugs

This commit is contained in:
zhanyun 2016-01-26 10:56:15 +08:00
parent 91d339344e
commit a0d3875a7c
9 changed files with 31 additions and 47 deletions

View File

@ -5,9 +5,9 @@
<entry key="pointerTableName">pointers</entry>
<entry key="sourceTableName">oschina_project,openhub_project,sourceforge_project,apache,freecode_project</entry>
<entry key="targetTableName">gather_projects</entry>
<entry key="sourceFields">id,name,tags,url,url_md5,description,language,source,license,homepage,now(),extracted_time,category,created_time</entry>
<entry key="sourceFields">id,name,tags,url,url_md5,description,language,source,license,homepage,now(),extracted_time,created_time</entry>
<!-- <entry key="targetFields">url,crawled_time,tags,license,name,description,language,platform,source,registered_time,urlMD5</entry> -->
<entry key="targetFields">id,name,tags,url,url_md5,description,language,source,license,homepage,updated_time,extracted_time,category,created_time</entry>
<entry key="targetFields">id,name,tags,url,url_md5,description,language,source,license,homepage,updated_time,extracted_time,created_time</entry>
<entry key="waitDataTime">3600000</entry>
<entry key="andWhere"> </entry>
<entry key="idsBegin">1</entry>

View File

@ -5,9 +5,9 @@
<entry key="pointerTableName">pointers</entry>
<entry key="sourceTableName">job_requirements</entry>
<entry key="targetTableName">job_requirements</entry>
<entry key="sourceFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,category,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<entry key="sourceFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<!-- <entry key="targetFields">url,crawled_time,tags,license,name,description,language,platform,source,registered_time,urlMD5</entry> -->
<entry key="targetFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,category,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<entry key="targetFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<entry key="waitDataTime">3600000</entry>
<entry key="andWhere"> </entry>
<entry key="idsBegin">1</entry>

View File

@ -17,7 +17,6 @@ public class GatherProject {
private String homepage;
private String updated_time;
private String extracted_time;
private String category;
private int composite_score;
private int relative_memos_num;
private String created_time;
@ -111,12 +110,6 @@ public class GatherProject {
/**
 * Stores the given value in this object's {@code extracted_time} field.
 *
 * @param extracted_time the new value; kept as a String exactly as passed in
 */
public void setExtracted_time(String extracted_time) {
this.extracted_time = extracted_time;
}
/**
 * Returns the current value of the {@code category} field.
 *
 * @return the stored category string, unchanged
 */
public String getCategory() {
return category;
}
/**
 * Stores the given value in the {@code category} field.
 *
 * @param category the new value; assigned as-is without validation
 */
public void setCategory(String category) {
this.category = category;
}
/**
 * Returns the current value of the {@code composite_score} field.
 *
 * @return the stored score as an int
 */
public int getComposite_score() {
return composite_score;
}

View File

@ -11,7 +11,6 @@ public class JobRequirement {
private String url_md5 = null;
private String author = null;
private String author_url = null;
private String category = null;
private int view_num = 0;
private int review_num = 0;
private String extracted_time = null;
@ -106,14 +105,6 @@ public class JobRequirement {
this.author_url = author_url;
}
/**
 * Returns the current value of the {@code category} field.
 *
 * @return the stored category string; the field is declared with a null initial value
 */
public String getCategory() {
return category;
}
/**
 * Stores the given value in the {@code category} field.
 *
 * @param category the new value; assigned as-is without validation
 */
public void setCategory(String category) {
this.category = category;
}
/**
 * Returns the current value of the {@code view_num} field.
 *
 * @return the stored view count as an int; the field is declared with an initial value of 0
 */
public int getView_num() {
return view_num;
}

View File

@ -134,7 +134,7 @@ public class GatherThread implements Runnable {
String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子
// 是否更新
int postId = 0;
RelativeMemo samePost = gatherDao.findPostByUrlMD5(
RelativeMemo samePost = targetDao.findPostByUrlMD5(
conf.getTargetTableName(), urlMD5);
if (samePost == null) {
// 不存在 插入
@ -199,10 +199,10 @@ public class GatherThread implements Runnable {
conf.getAndWhere());
for (int i = 0; i < dataGet.size(); i++) {
GatherProject model = dataGet.get(i);
String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子
String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该项目
// 是否更新
int prjId = 0;
GatherProject samePrj = gatherDao.findPrjByUrlMD5(
GatherProject samePrj = targetDao.findPrjByUrlMD5(
conf.getTargetTableName(), urlMD5);
if (samePrj == null) {
// 不存在 插入
@ -270,7 +270,7 @@ public class GatherThread implements Runnable {
String urlMD5 = model.getUrl_md5();// 通过urlMD5判断是不是已经存在该帖子
// 是否更新
int postId = 0;
JobRequirement sameJob = gatherDao.findJobByUrlMD5(
JobRequirement sameJob = targetDao.findJobByUrlMD5(
conf.getTargetTableName(), urlMD5);
if (sameJob == null) {
// 不存在 插入

View File

@ -75,21 +75,6 @@ public interface GatherDao {
@Param("sourceTableName") String sourceTableName,
@Param("targetTableName") String targetTableName);
// Looks up a RelativeMemo by its URL MD5 value: selects all columns from the
// table named by `table` where url_md5 equals `urlMD5`.
// `table` is inserted into the SQL text through ${...}, while `urlMD5` is
// passed through #{...}.
// NOTE(review): ${...} is plain text substitution in MyBatis, so `table` should
// only ever come from trusted configuration — confirm against callers.
// NOTE(review): the caller visible in this listing treats a null result as
// "no matching row"; behaviour when several rows share one url_md5 is not
// shown here — confirm.
@Select("select * from ${table} where url_md5=#{urlMD5}")
public RelativeMemo findPostByUrlMD5(@Param("table") String table,
@Param("urlMD5") String urlMD5);
// Looks up a project by its URL MD5 value: selects all columns from the table
// named by `table` where url_md5 equals `urlMD5`.
// The result is mapped to GatherProject (the declared return type).
// `table` is inserted into the SQL text through ${...}, while `urlMD5` is
// passed through #{...}.
// NOTE(review): ${...} is plain text substitution in MyBatis, so `table` should
// only ever come from trusted configuration — confirm against callers.
// NOTE(review): the caller visible in this listing treats a null result as
// "no matching row"; behaviour when several rows share one url_md5 is not
// shown here — confirm.
@Select("select * from ${table} where url_md5=#{urlMD5}")
public GatherProject findPrjByUrlMD5(@Param("table") String table,
@Param("urlMD5") String urlMD5);
// Looks up a JobRequirement by its URL MD5 value: selects all columns from the
// table named by `table` where url_md5 equals `urlMD5`.
// `table` is inserted into the SQL text through ${...}, while `urlMD5` is
// passed through #{...}.
// NOTE(review): ${...} is plain text substitution in MyBatis, so `table` should
// only ever come from trusted configuration — confirm against callers.
// NOTE(review): the caller visible in this listing treats a null result as
// "no matching row"; behaviour when several rows share one url_md5 is not
// shown here — confirm.
@Select("select * from ${table} where url_md5=#{urlMD5}")
public JobRequirement findJobByUrlMD5(@Param("table") String table,
@Param("urlMD5") String urlMD5);
// 批量获得JobRequirement
@Select("SELECT * from job_requirements WHERE id > #{id} limit #{batchSize}")
public List<JobRequirement> getJobRequirementList(@Param("id") int id,

View File

@ -64,7 +64,7 @@ public interface TargetDao {
// 将获取到得数据源数据插入目标表中
@Insert("INSERT INTO ${targetTable} (${targetFields}) VALUES (#{model.id},#{model.name},#{model.tags},#{model.url},#{model.url_md5},"
+ "#{model.description},#{model.language},#{model.source},#{model.license},#{model.homepage},#{model.updated_time},#{model.extracted_time},#{model.category},#{model.created_time})")
+ "#{model.description},#{model.language},#{model.source},#{model.license},#{model.homepage},#{model.updated_time},#{model.extracted_time},#{model.created_time})")
public int insertOpenSourceProject(
@Param("targetTable") String targetTableName,
@Param("targetFields") String targetFields,
@ -73,14 +73,14 @@ public interface TargetDao {
// 对urlMD5相同的数据进行update操作
@Update("update ${targetTable} set id=#{model.id},name=#{model.name},tags=#{model.tags},url=#{model.url},url_md5=#{model.url_md5},"
+ "description=#{model.description},language=#{model.language},source=#{model.source},license=#{model.license},homepage=#{model.homepage},updated_time=#{model.updated_time},extracted_time=#{model.extracted_time},"
+ "category=#{model.category},created_time=#{model.created_time} where id=#{id}")
+ "created_time=#{model.created_time} where id=#{id}")
public void updateOpenSourceProject(
@Param("targetTable") String targetTableName,
@Param("model") GatherProject model, @Param("id") int id);
// 将获取到得数据源数据插入目标表中
@Insert("INSERT INTO ${targetTable} (${targetFields}) VALUES (#{model.id},#{model.title},#{model.content},#{model.created_time},#{model.type},"
+ "#{model.tags},#{model.url},#{model.url_md5},#{model.author},#{model.author_url},#{model.category},#{model.view_num},#{model.review_num},#{model.extracted_time}"
+ "#{model.tags},#{model.url},#{model.url_md5},#{model.author},#{model.author_url},#{model.view_num},#{model.review_num},#{model.extracted_time}"
+ ",#{model.source},#{model.similar_position},#{model.work_place},#{model.experience},#{model.salary},#{model.scale},#{model.domain}"
+ ",#{model.finance},#{model.style},#{model.education})")
public int insertJobRequirement(
@ -90,7 +90,7 @@ public interface TargetDao {
// 对urlMD5相同的数据进行update操作
@Update("update ${targetTable} set id=#{model.id},title=#{model.title},content=#{model.content},created_time=#{model.created_time},type=#{model.type},"
+ "tags=#{model.tags},url=#{model.url},url_md5=#{model.url_md5},author=#{model.author},author_url=#{model.author_url},category=#{model.category},view_num=#{model.view_num},"
+ "tags=#{model.tags},url=#{model.url},url_md5=#{model.url_md5},author=#{model.author},author_url=#{model.author_url},view_num=#{model.view_num},"
+ "review_num=#{model.review_num},extracted_time=#{model.extracted_time},source=#{model.source},similar_position=#{model.similar_position}"
+ ",work_place=#{model.work_place},experience=#{model.experience},salary=#{model.salary},scale=#{model.scale},domain=#{model.domain}"
+ ",finance=#{model.finance},style=#{model.style},education=#{model.education} where id=#{id}")
@ -103,4 +103,19 @@ public interface TargetDao {
public int getAutoIncrementOspId(@Param("table") String table,
@Param("model") GatherProject model);
// Looks up a RelativeMemo by its URL MD5 value: selects all columns from the
// table named by `table` where url_md5 equals `urlMD5`.
// `table` is inserted into the SQL text through ${...}, while `urlMD5` is
// passed through #{...}.
// NOTE(review): ${...} is plain text substitution in MyBatis, so `table` should
// only ever come from trusted configuration — confirm against callers.
// NOTE(review): the caller visible in this listing passes the configured target
// table name and treats a null result as "no matching row"; behaviour when
// several rows share one url_md5 is not shown here — confirm.
@Select("select * from ${table} where url_md5=#{urlMD5}")
public RelativeMemo findPostByUrlMD5(@Param("table") String table,
@Param("urlMD5") String urlMD5);
// Looks up a project by its URL MD5 value: selects all columns from the table
// named by `table` where url_md5 equals `urlMD5`.
// The result is mapped to GatherProject (the declared return type).
// `table` is inserted into the SQL text through ${...}, while `urlMD5` is
// passed through #{...}.
// NOTE(review): ${...} is plain text substitution in MyBatis, so `table` should
// only ever come from trusted configuration — confirm against callers.
// NOTE(review): the caller visible in this listing passes the configured target
// table name and treats a null result as "no matching row"; behaviour when
// several rows share one url_md5 is not shown here — confirm.
@Select("select * from ${table} where url_md5=#{urlMD5}")
public GatherProject findPrjByUrlMD5(@Param("table") String table,
@Param("urlMD5") String urlMD5);
// Looks up a JobRequirement by its URL MD5 value: selects all columns from the
// table named by `table` where url_md5 equals `urlMD5`.
// `table` is inserted into the SQL text through ${...}, while `urlMD5` is
// passed through #{...}.
// NOTE(review): ${...} is plain text substitution in MyBatis, so `table` should
// only ever come from trusted configuration — confirm against callers.
// NOTE(review): the caller visible in this listing passes the configured target
// table name and treats a null result as "no matching row"; behaviour when
// several rows share one url_md5 is not shown here — confirm.
@Select("select * from ${table} where url_md5=#{urlMD5}")
public JobRequirement findJobByUrlMD5(@Param("table") String table,
@Param("urlMD5") String urlMD5);
}

View File

@ -5,9 +5,9 @@
<entry key="pointerTableName">pointers</entry>
<entry key="sourceTableName">oschina_project,openhub_project</entry>
<entry key="targetTableName">gather_projects</entry>
<entry key="sourceFields">id,name,tags,url,url_md5,description,language,source,license,homepage,now(),extracted_time,category,created_time</entry>
<entry key="sourceFields">id,name,tags,url,url_md5,description,language,source,license,homepage,now(),extracted_time,created_time</entry>
<!-- <entry key="targetFields">url,crawled_time,tags,license,name,description,language,platform,source,registered_time,urlMD5</entry> -->
<entry key="targetFields">id,name,tags,url,url_md5,description,language,source,license,homepage,updated_time,extracted_time,category,created_time</entry>
<entry key="targetFields">id,name,tags,url,url_md5,description,language,source,license,homepage,updated_time,extracted_time,created_time</entry>
<entry key="waitDataTime">3600000</entry>
<entry key="andWhere"> </entry>
<entry key="idsBegin">1</entry>

View File

@ -5,9 +5,9 @@
<entry key="pointerTableName">pointers</entry>
<entry key="sourceTableName">job_requirements</entry>
<entry key="targetTableName">job_requirements_new</entry>
<entry key="sourceFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,category,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<entry key="sourceFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<!-- <entry key="targetFields">url,crawled_time,tags,license,name,description,language,platform,source,registered_time,urlMD5</entry> -->
<entry key="targetFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,category,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<entry key="targetFields">id,title,content,created_time,type,tags,url,url_md5,author,author_url,view_num,review_num,extracted_time,source,similar_position,work_place,experience,salary,scale,domain,finance,style,education</entry>
<entry key="waitDataTime">3600000</entry>
<entry key="andWhere"> </entry>
<entry key="idsBegin">1</entry>