gather_posts

This commit is contained in:
zhanyun 2015-12-08 11:19:50 +08:00
parent e5c17cb0ef
commit f7ac1a3cb2
54 changed files with 1960 additions and 2 deletions

26
gather_posts/.classpath Normal file
View File

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/test-classes" path="src/test/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/J2SE-1.5">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/>
</classpath>

1
gather_posts/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target/

23
gather_posts/.project Normal file
View File

@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>gather_posts</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>
</projectDescription>

View File

@ -0,0 +1,4 @@
eclipse.preferences.version=1
encoding//src/main/java=UTF-8
encoding//src/test/java=UTF-8
encoding/<project>=UTF-8

View File

@ -0,0 +1,5 @@
eclipse.preferences.version=1
org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.5
org.eclipse.jdt.core.compiler.compliance=1.5
org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
org.eclipse.jdt.core.compiler.source=1.5

View File

@ -0,0 +1,4 @@
activeProfiles=
eclipse.preferences.version=1
resolveWorkspaceProjects=true
version=1

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=51cto_blog_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=cnblog_news_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=cnblog_question_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=csdn_ask_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=csdn_blogs_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=csdn_topics_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=dewen_question_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=freecode_projects_to_t_project

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=iteye_ask_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=linuxtone_posts_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=openhub_project_to_t_project

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=oschina_project_to_t_project

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=oschina_question_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Starts the TableFlow job named by $task in the background and appends its
# stdout/stderr to log/<task>.log. All paths are relative to the current directory.
task=phpchina_posts_to_t_knowledge

# Drop resource files (and the spring directory) from target/classes;
# presumably so the copies under ./bin/resources are the ones loaded - confirm.
# "-exec ... {} +" replaces "find | xargs rm", which mis-splits names containing whitespace.
find ./target/classes \( -name '*.properties' -o -name '*.xml' -o -name '*.dic' \) \
  ! -type d -exec rm -f {} +
rm -rf ./target/classes/spring

# Search order: dependency jars, compiled classes, external resources, inherited CLASSPATH.
CLASSPATH="./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*:./target/classes:./bin/resources:$CLASSPATH"
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"

# Without log/ the redirection below fails and java is never started.
mkdir -p log
# $JVM_ARGS stays unquoted on purpose so the shell splits it into separate options;
# the classpath is quoted so its "*" wildcard reaches the JVM unexpanded.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table 51cto_blog, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">51cto_blog</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,NULL,vote_up,NULL,extractTime,Url,created_time,"51cto_blog",author_url,NULL,"blogs",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table cnblog_news, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">cnblog_news</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,NULL,collection_num,extractTime,url,created_time,"cnblogs",author_url,last_time,"news", MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table cnblog_question, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">cnblog_question</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,vote_num,collection_num,extractTime,url,created_time,"cnblogs",author_url,last_time,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table csdn_ask, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">csdn_ask</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,vote_num,collection_num,extractTime,url,created_time,"CSDN",author_url,last_time,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table csdn_blogs, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">csdn_blogs</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<!-- NOTE(review): the type literal is "blog" here but "blogs" in the 51cto_blog job; confirm which value t_knowledge.type expects. -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,(supportNum-opposeNum),NULL,extractTime,url,created_time,"CSDN",author_url,NULL,"blog",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table csdn_topics, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">csdn_topics</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,replyNum,view_num,vote_num,collection_num,extractTime,url,created_time,"CSDN",author_url,NULL,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table dewen_question, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">dewen_question</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,(reply_num+answerNum),view_num,vote_num,collection_num,extractTime,url,created_time,"DEWEN",author_url,NULL,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings with placeholder-style names: source table src, target table dest. -->
<!-- NOTE(review): unlike the other job files, this one has no waitDataTime or andWhere entry and
     names the pointer table "pointer" rather than "pointers"; confirm TableFlow accepts that. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointer</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">src</entry>
<entry key="targetTableName">dest</entry>
<!-- Both field lists hold 3 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">id,name,url</entry>
<entry key="targetFields">proj_id, proj_name, proj_url</entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings with placeholder-style names: source table source, target table target. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointer</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">source</entry>
<entry key="targetTableName">target</entry>
<!-- Both field lists hold 3 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">sField1,sField2,sField3</entry>
<entry key="targetFields">tField1,tField2,tField3</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">5000</entry>
</properties>

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table freecode_projects, target table t_project. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">freecode_projects</entry>
<entry key="targetTableName">t_project</entry>
<!-- Both field lists hold 11 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">Url,extract_time,tag,license,project_title,project_des,implementation,operate_system,"FreeCode",post_date,MD5(Url)</entry>
<entry key="targetFields">url,crawled_time,tags,license,name,description,language,platform,source,registered_time,urlMD5</entry>
<entry key="waitDataTime">3600000</entry>
<!-- Extra row filter; the value itself starts with AND. -->
<entry key="andWhere">AND tag is NOT NULL AND project_des IS NOT NULL</entry>
<entry key="idsBegin">1</entry>
<entry key="idsEnd">500000</entry>
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table iteye_ask, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">iteye_ask</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,(vote_num-voteDown),collection_num,extractTime,url,created_time,"ITeye",author_url,NULL,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table linuxtone, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">linuxtone</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,vote_num,collection_num,extractTime,url,created_time,"linuxtone",author_url,NULL,"posts",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE log4j:configuration SYSTEM "log4j.dtd">
<!--
  log4j 1.x configuration.
  stdout   : console, no threshold of its own
  file     : ./log/error.log, ERROR and above
  file_log : ./log/tflow.log, no threshold of its own
  MAIL     : SMTP appender, defined but not attached to any logger
-->
<log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
  <appender name="stdout" class="org.apache.log4j.ConsoleAppender">
    <!-- A "threshold" param used to sit inside the layout below, where PatternLayout
         has no such property and log4j ignores it with a warning. It was removed so
         the effective console output is unchanged. To limit the console to errors,
         add <param name="Threshold" value="ERROR" /> here, before the layout. -->
    <layout class="org.apache.log4j.PatternLayout">
      <param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} %-5p ## %m%n" />
    </layout>
  </appender>
  <appender name="file" class="org.apache.log4j.DailyRollingFileAppender">
    <param name="File" value="./log/error.log" />
    <param name="threshold" value="ERROR" />
    <layout class="org.apache.log4j.PatternLayout">
      <param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} ## %m%n" />
    </layout>
  </appender>
  <appender name="file_log" class="org.apache.log4j.DailyRollingFileAppender">
    <param name="File" value="./log/tflow.log" />
    <layout class="org.apache.log4j.PatternLayout">
      <param name="ConversionPattern" value="%d{yy-MM-dd HH:mm:ss,SSS} ## %m%n" />
    </layout>
  </appender>
  <!-- Mail is only sent when an ERROR event occurs. -->
  <appender name="MAIL" class="org.apache.log4j.net.SMTPAppender">
    <param name="threshold" value="debug" />
    <!-- Alternative level: <param name="threshold" value="fatal"/> -->
    <!-- BufferSize is the number of logging events kept for the mail, not a size in KB. -->
    <param name="BufferSize" value="1" />
    <param name="From" value="ossean_debug@163.com" />
    <param name="SMTPHost" value="smtp.163.com" />
    <param name="Subject" value="ossean-crawler-debug-log4jMessage" />
    <param name="To" value="getbox@126.com" />
    <param name="SMTPUsername" value="ossean_debug" />
    <!-- The password is no longer stored in this file: pass it at launch with
         -Dossean.smtp.password=... (log4j substitutes system properties in values).
         The value that was committed here earlier should be treated as leaked and rotated. -->
    <param name="SMTPPassword" value="${ossean.smtp.password}" />
    <layout class="org.apache.log4j.PatternLayout">
      <param name="ConversionPattern" value="%-d{yyyy-MM-dd HH:mm:ss.SSS} [%p]-[%c] %m%n" />
    </layout>
  </appender>
  <logger name="org.apache" additivity="false">
    <level value="warn" />
    <appender-ref ref="stdout" />
  </logger>
  <root>
    <level value="info" />
    <appender-ref ref="stdout" />
    <appender-ref ref="file" />
    <appender-ref ref="file_log" />
    <!-- <appender-ref ref="MAIL" />-->
    <!-- No appender named DATABASE is defined in this file, so the reference only
         produced a configuration error at startup. Re-enable it once such an appender exists. -->
    <!-- <appender-ref ref="DATABASE" /> -->
  </root>
</log4j:configuration>

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table openhub_project, target table t_project. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">openhub_project</entry>
<entry key="targetTableName">t_project</entry>
<!-- Both field lists hold 11 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">name,description,tags,Url,licenses,contributorNum,firstCommitTime,lastCommitTime,"OpenHub",crawlerTime,MD5(Url)</entry>
<entry key="targetFields">name,description,tags,url,license,contributors_num,registered_time,last_update_time,source,crawled_time,urlMD5</entry>
<entry key="waitDataTime">3600000</entry>
<!-- Extra row filter. The leading AND was missing; it is added to match the other
     non-empty andWhere values (freecode_projects, sourceforge_project), which start with AND.
     NOTE(review): confirm against how TableFlow appends this value to its query. -->
<entry key="andWhere">AND useCount > 0</entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table oschina_project, target table t_project. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">oschina_project</entry>
<entry key="targetTableName">t_project</entry>
<!-- Both field lists hold 12 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<!-- NOTE(review): "exteactTime" looks like a misspelling of "extractTime"; confirm the real column name in oschina_project. -->
<entry key="sourceFields">projectShortName,Url,projectDesc,projectCategory,projectLicenses,projectLanguage,exteactTime,pageMD5,history,projectOS,"OSChina",MD5(Url)</entry>
<entry key="targetFields">name,url,description,category,license,language,crawled_time,pageMD5,history,platform,source,urlMD5</entry>
<entry key="waitDataTime">3600000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table oschina_question, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">oschina_question</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,vote_num,collection_num,extractTime,url,created_time,"OSChina",author_url,NULL,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table phpchina, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">phpchina</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,NULL,vote_num,collection_num,extractTime,url,created_time,"PhpChina",author_url,NULL,"posts",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table slashdot, target table t_knowledge. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">slashdot</entry>
<entry key="targetTableName">t_knowledge</entry>
<!-- Both field lists hold 16 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">deal_title,author,content,tags,reply_num,view_num,vote_num,collection_num,extractTime,url,created_time,"slashdot",author_url,NULL,"posts",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<entry key="waitDataTime">10000</entry>
<entry key="andWhere"></entry>
<!-- Id value at which the transfer starts -->
<entry key="idsBegin">1</entry>
<!-- Id value at which the transfer ends -->
<entry key="idsEnd">500000</entry>
<!-- Number of ids transferred per round -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!-- TableFlow job settings: source table sourceforge_project, target table t_project. -->
<properties>
<comment>TableFlow</comment>
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<entry key="sourceTableName">sourceforge_project</entry>
<entry key="targetTableName">t_project</entry>
<!-- Both field lists hold 14 comma-separated items; presumably paired by position (confirm in TableFlow). -->
<entry key="sourceFields">name,downloadCount,lastUpdate,`desc`,categories,license,programmingLanguage,registeredTime,pageMd5,history,platform,Url,"SourceForge",MD5(Url)</entry>
<entry key="targetFields">name,download_num,last_update_time,description,category,license,language,registered_time,pageMD5,history,platform,url,source,urlMD5</entry>
<entry key="waitDataTime">3600000</entry>
<!-- Extra row filter; the value itself starts with AND. -->
<entry key="andWhere">AND stars > 0 AND downloadCount > 0</entry>
<entry key="idsBegin">1</entry>
<entry key="idsEnd">500000</entry>
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,20 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<!--
TableFlow task settings: copy rows from the "stackoverflow_q" table into "t_knowledge".
NOTE(review): the meaning of each key below is inferred from its name and value;
confirm against the code that loads this file.
-->
<properties>
<comment>TableFlow</comment>
<!-- Name of the table that stores the migration pointer for each source/target pair. -->
<entry key="pointerTableName">pointers</entry>
<entry key="migrationTask">migrationTask</entry>
<!-- Table the rows are read from and table they are written to. -->
<entry key="sourceTableName">stackoverflow_q</entry>
<entry key="targetTableName">t_knowledge</entry>
<!--
sourceFields and targetFields each list 16 items and appear to be paired by position.
"StackOverflow", "topic" and MD5(Url) are computed values with no direct source column.
-->
<entry key="sourceFields">questionTitle,author,questionContent,tag,answerNum,viewNum,voteNum,likeNum,extractTime,Url,postTime,"StackOverflow",authorUrl,activeTime,"topic",MD5(Url)</entry>
<entry key="targetFields">title,author,content,tags,reply_num,view_num,vote_num,collection_num,crawled_time,url,created_time,source,author_url,last_time,type,url_md5</entry>
<!-- Wait time when no new data is available; presumably milliseconds (TODO confirm). -->
<entry key="waitDataTime">10000</entry>
<!-- Extra SQL condition appended to the source query; empty means no extra filter. -->
<entry key="andWhere"></entry>
<!-- First Id to migrate -->
<entry key="idsBegin">1</entry>
<!-- Last Id to migrate -->
<entry key="idsEnd">500000</entry>
<!-- Number of Ids migrated per batch -->
<entry key="idsIncrement">500</entry>
</properties>

View File

@ -0,0 +1,23 @@
#!/bin/bash
# Starts the TableFlow task "slashdot_posts_to_t_kowledge" in the background.
# Run from the project root: every path below is relative to it.
# NOTE(review): the task name is spelled "kowledge"; presumably it matches the
# name of the task's configuration, so it is left untouched.
task=slashdot_posts_to_t_kowledge

# Drop resource files that were copied into target/classes; presumably so the
# copies under ./bin/resources (also on the classpath) are the ones loaded.
# "-exec ... {} +" is used instead of "| xargs" so paths with blanks are safe.
find ./target/classes -name "*.properties" -exec rm -f {} +
find ./target/classes -name "*.xml" -exec rm -f {} +
find ./target/classes -name "*.dic" -exec rm -f {} +
rm -rf ./target/classes/spring

#export CLASSPATH=$CURR_DIR/lib:$CURR_DIR:$JAVA_HOME/lib:$JAVA_HOME/jre/lib
# Classpath order: dependency jars, compiled classes, external resources,
# then whatever CLASSPATH the caller already had.
tmp='./bin/resources'
tmp='./target/classes':$tmp
tmp='./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*':$tmp
CLASSPATH=$tmp:$CLASSPATH
# Quoted so the shell never tries to expand the "*" wildcard itself.
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out

# The redirect below fails (and java is never started) if log/ is missing.
mkdir -p log
# JVM_ARGS is intentionally unquoted: it must split into separate options.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow task "sourceforge_project_to_t_project" in the background.
# Run from the project root: every path below is relative to it.
task=sourceforge_project_to_t_project

# Drop resource files that were copied into target/classes; presumably so the
# copies under ./bin/resources (also on the classpath) are the ones loaded.
# "-exec ... {} +" is used instead of "| xargs" so paths with blanks are safe.
find ./target/classes -name "*.properties" -exec rm -f {} +
find ./target/classes -name "*.xml" -exec rm -f {} +
find ./target/classes -name "*.dic" -exec rm -f {} +
rm -rf ./target/classes/spring

#export CLASSPATH=$CURR_DIR/lib:$CURR_DIR:$JAVA_HOME/lib:$JAVA_HOME/jre/lib
# Classpath order: dependency jars, compiled classes, external resources,
# then whatever CLASSPATH the caller already had.
tmp='./bin/resources'
tmp='./target/classes':$tmp
tmp='./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*':$tmp
CLASSPATH=$tmp:$CLASSPATH
# Quoted so the shell never tries to expand the "*" wildcard itself.
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out

# The redirect below fails (and java is never started) if log/ is missing.
mkdir -p log
# JVM_ARGS is intentionally unquoted: it must split into separate options.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow task "stackoverflow_q_to_t_knowledge" in the background.
# Run from the project root: every path below is relative to it.
task=stackoverflow_q_to_t_knowledge

# Drop resource files that were copied into target/classes; presumably so the
# copies under ./bin/resources (also on the classpath) are the ones loaded.
# "-exec ... {} +" is used instead of "| xargs" so paths with blanks are safe.
find ./target/classes -name "*.properties" -exec rm -f {} +
find ./target/classes -name "*.xml" -exec rm -f {} +
find ./target/classes -name "*.dic" -exec rm -f {} +
rm -rf ./target/classes/spring

#export CLASSPATH=$CURR_DIR/lib:$CURR_DIR:$JAVA_HOME/lib:$JAVA_HOME/jre/lib
# Classpath order: dependency jars, compiled classes, external resources,
# then whatever CLASSPATH the caller already had.
tmp='./bin/resources'
tmp='./target/classes':$tmp
tmp='./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*':$tmp
CLASSPATH=$tmp:$CLASSPATH
# Quoted so the shell never tries to expand the "*" wildcard itself.
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out

# The redirect below fails (and java is never started) if log/ is missing.
mkdir -p log
# JVM_ARGS is intentionally unquoted: it must split into separate options.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

24
gather_posts/bin/start.sh Normal file
View File

@ -0,0 +1,24 @@
#!/bin/bash
# Starts the TableFlow task named "test" in the background.
# Run from the project root: every path below is relative to it.
task=test

# Drop resource files that were copied into target/classes; presumably so the
# copies under ./bin/resources (also on the classpath) are the ones loaded.
# "-exec ... {} +" is used instead of "| xargs" so paths with blanks are safe.
find ./target/classes -name "*.properties" -exec rm -f {} +
find ./target/classes -name "*.xml" -exec rm -f {} +
find ./target/classes -name "*.dic" -exec rm -f {} +
rm -rf ./target/classes/spring

#export CLASSPATH=$CURR_DIR/lib:$CURR_DIR:$JAVA_HOME/lib:$JAVA_HOME/jre/lib
# Classpath order: dependency jars, compiled classes, external resources,
# then whatever CLASSPATH the caller already had.
tmp='./bin/resources'
tmp='./target/classes':$tmp
tmp='./target/tFlow-1.0-SNAPSHOT-jar-with-dependencies-without-resources/*':$tmp
CLASSPATH=$tmp:$CLASSPATH
# Quoted so the shell never tries to expand the "*" wildcard itself.
echo "$CLASSPATH"

JVM_ARGS="-Xmn48m -Xmx128m -Xms128m -XX:NewRatio=4 -XX:SurvivorRatio=4 -XX:MaxTenuringThreshold=2"
#echo JVM_ARGS=$JVM_ARGS
#ulimit -n 400000
#echo "" > nohup.out

# The redirect below fails (and java is never started) if log/ is missing.
mkdir -p log
# JVM_ARGS is intentionally unquoted: it must split into separate options.
java $JVM_ARGS -classpath "$CLASSPATH" net.trustie.db.TableFlow "$task" >>"log/${task}.log" 2>&1 &

View File

@ -0,0 +1,19 @@
#!/bin/bash
# Launches every per-source migration script in turn. Each launcher puts its
# own java process in the background, so this script returns quickly.
# Run from the project root: the launchers are addressed as bin/<name>.sh.
sh bin/cnblog_news_to_t_knowledge.sh
sh bin/cnblog_question_to_t_knowledge.sh
sh bin/csdn_ask_to_t_knowledge.sh
sh bin/csdn_blogs_to_t_knowledge.sh
sh bin/csdn_topics_to_t_knowledge.sh
sh bin/dewen_question_to_t_knowledge.sh
sh bin/freecode_projects_to_t_project.sh
sh bin/iteye_ask_to_t_knowledge.sh
sh bin/openhub_project_to_t_project.sh
sh bin/oschina_project_to_t_project.sh
sh bin/oschina_question_to_t_knowledge.sh
sh bin/sourceforge_project_to_t_project.sh
sh bin/stackoverflow_q_to_t_knowledge.sh
sh bin/51cto_blog_to_t_knowledge.sh
# NOTE(review): "konwlege" and "kowledge" below look like typos but presumably
# match the launcher files as they are named on disk; verify before renaming.
sh bin/linuxtone_posts_to_t_konwlege.sh
sh bin/slashdot_posts_to_t_kowledge.sh
# Was "...t_knowledge,sh" (comma instead of dot), so this launcher was never found.
sh bin/phpchina_posts_to_t_knowledge.sh

93
gather_posts/pom.xml Normal file
View File

@ -0,0 +1,93 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build for the gather_posts module: a plain jar built from src/main/java. -->
<modelVersion>4.0.0</modelVersion>
<groupId>cn.edu.zhanyun</groupId>
<artifactId>gather_posts</artifactId>
<version>0.0.1-SNAPSHOT</version>
<packaging>jar</packaging>
<name>gather_posts</name>
<url>http://maven.apache.org</url>
<build>
<sourceDirectory>${basedir}/src/main/java</sourceDirectory>
<plugins>
<!-- Compile for Java 7 source and bytecode level. -->
<!-- NOTE(review): no plugin version is pinned here; consider adding one for reproducible builds. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.7</source>
<target>1.7</target>
</configuration>
</plugin>
<!-- Packaging layout is defined by the descriptor src/main/assembly/assembly.xml. -->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.5.1</version>
<configuration>
<descriptors>
<descriptor>src/main/assembly/assembly.xml</descriptor>
</descriptors>
</configuration>
</plugin>
</plugins>
</build>
<properties>
<!-- Source files are read as UTF-8. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- Test only: JUnit 3 (TestCase/TestSuite style tests). -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- Logging: SLF4J bound to log4j 1.2. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.7</version>
</dependency>
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2.1</version>
</dependency>
<!-- NOTE(review): commons-io is normally published under groupId "commons-io"; confirm these coordinates are intended. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-io</artifactId>
<version>1.3.2</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-context</artifactId>
<version>4.1.4.RELEASE</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.1</version>
</dependency>
<!-- Database access: MySQL JDBC driver, DBCP connection pool, MyBatis and its Spring integration. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.18</version>
</dependency>
<dependency>
<groupId>commons-dbcp</groupId>
<artifactId>commons-dbcp</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis</artifactId>
<version>3.1.1</version>
</dependency>
<dependency>
<groupId>org.mybatis</groupId>
<artifactId>mybatis-spring</artifactId>
<version>1.1.1</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,12 @@
package org.ossean.transfertknowledgeandtagmatch.dao;
import java.util.List;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.ossean.transfertknowledgeandtagmatch.model.JobRequirement;
/**
 * MyBatis mapper that reads the {@code job_requirements} table in id-ordered batches.
 */
public interface JobRequirementDAO {

    /**
     * Returns up to {@code batchSize} rows whose id is strictly greater than {@code id}.
     *
     * <p>The rows are explicitly ordered by id. Without an ORDER BY the database may
     * return any {@code batchSize} matching rows, so a caller that continues from the
     * largest id of the previous batch could silently skip rows.
     *
     * @param id        exclusive lower bound of the ids to fetch
     * @param batchSize maximum number of rows to return
     * @return the matching rows in ascending id order
     */
    @Select("SELECT * from job_requirements WHERE id > #{id} ORDER BY id limit #{batchSize}")
    public List<JobRequirement> getJobRequirementList(@Param("id") int id, @Param("batchSize") int batchSize);
}

View File

@ -0,0 +1,20 @@
package org.ossean.transfertknowledgeandtagmatch.dao;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.apache.ibatis.annotations.Update;
/**
 * MyBatis mapper for the table that records one {@code Pointer} value per
 * (source table, target table) pair.
 *
 * <p>The table name is inserted with {@code ${table}}, i.e. as raw text rather than
 * as a bound parameter, so it must only ever come from trusted configuration.
 */
public interface PointerDAO {

    /**
     * Reads the stored pointer for a source/target pair.
     *
     * @param table           name of the pointer table
     * @param sourceTableName value matched against the {@code SourceTableName} column
     * @param targetTableName value matched against the {@code TargetTableName} column
     * @return the value of the {@code Pointer} column
     */
    @Select("select Pointer from ${table} where SourceTableName=#{SourceTableName} and TargetTableName=#{TargetTableName}")
    int getPointer(
            @Param("table") String table,
            @Param("SourceTableName") String sourceTableName,
            @Param("TargetTableName") String targetTableName);

    /**
     * Overwrites the stored pointer of an existing source/target pair.
     *
     * @param table           name of the pointer table
     * @param sourceTableName value matched against the {@code SourceTableName} column
     * @param targetTableName value matched against the {@code TargetTableName} column
     * @param pointer         new value for the {@code Pointer} column
     */
    @Update("update ${table} set Pointer=#{Pointer} where SourceTableName=#{SourceTableName} and TargetTableName=#{TargetTableName}")
    void updatePointer(
            @Param("table") String table,
            @Param("SourceTableName") String sourceTableName,
            @Param("TargetTableName") String targetTableName,
            @Param("Pointer") int pointer);

    /**
     * Inserts a new pointer row for a source/target pair.
     *
     * @param table           name of the pointer table
     * @param sourceTableName value stored in the {@code SourceTableName} column
     * @param targetTableName value stored in the {@code TargetTableName} column
     * @param pointer         value stored in the {@code Pointer} column
     */
    @Insert("insert into ${table} (`SourceTableName`,`TargetTableName`,`Pointer`) values (#{SourceTableName},#{TargetTableName},#{Pointer})")
    void insertPointer(
            @Param("table") String table,
            @Param("SourceTableName") String sourceTableName,
            @Param("TargetTableName") String targetTableName,
            @Param("Pointer") int pointer);
}

View File

@ -0,0 +1,21 @@
package org.ossean.transfertknowledgeandtagmatch.dao;
import java.util.List;
import java.util.Set;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.ossean.transfertknowledgeandtagmatch.model.RelativeMemo;
/**
 * MyBatis mapper for read access to the {@code relative_memos} table.
 */
public interface RelativeMemoDao {

    /**
     * Fetches a batch of posts by id range.
     *
     * @param startId exclusive lower bound of the id range
     * @param endId   inclusive upper bound of the id range
     * @return every row with {@code startId < id <= endId}
     */
    @Select("select * from relative_memos where id>#{startId} AND id<=#{endId}")
    List<RelativeMemo> getMemoInfo(@Param("startId") int startId, @Param("endId") int endId);

    /**
     * Loads the distinct {@code url_md5} values already present in {@code relative_memos}.
     *
     * @return the set of existing url_md5 values
     */
    @Select("SELECT DISTINCT(url_md5) AS md5 FROM relative_memos")
    Set<String> initUrlSet();
}

View File

@ -0,0 +1,328 @@
package org.ossean.transfertknowledgeandtagmatch.model;
/**
 * Mutable bean holding one row of job-requirement data.
 *
 * <p>Property names deliberately keep their snake_case spelling; they presumably
 * mirror column names so that rows can be mapped onto the bean by name.
 */
public class JobRequirement {
    private int id = 0;
    private String author_url = null;
    private String author = null;
    private String url = null;
    private String deal_title = null;
    private String tags = null;
    private String content = null;
    private String salary = null;
    private String experience = null;
    private String comp_name = null;
    private String comp_url = null;
    private String addr = null;
    private String comp_scale = null;
    private String comp_field = null;
    private String financing = null;
    private String comp_vision = null;
    private String created_time = null;
    private String extractTime = null;
    private int history = 0;
    private String relative_osps = null;
    private int reply_num = 0;
    private int view_num = 0;
    private int vote_num = 0;
    private int collection_num = 0;
    private String abstractString = null;
    private String type = null;
    private String crawled_time = null;
    private String category = null;
    private String source = null;
    private String last_time = null;
    private String url_md5 = null;

    /**
     * Creates an empty instance: numeric properties are 0, all others {@code null}.
     *
     * <p>Needed by frameworks that instantiate the bean reflectively and then fill
     * it through the setters; the class previously offered only the all-arguments
     * constructor, which such frameworks cannot call.
     */
    public JobRequirement() {
    }

    /** Creates a fully populated instance; arguments are stored unchanged. */
    public JobRequirement(int id, String author_url, String author, String url,
            String deal_title, String tags, String content, String salary,
            String experience, String comp_name, String comp_url, String addr,
            String comp_scale, String comp_field, String financing,
            String comp_vision, String created_time, String extractTime,
            int history, String relative_osps, int reply_num, int view_num,
            int vote_num, int collection_num, String abstractString,
            String type, String crawled_time, String category, String source,
            String last_time, String url_md5) {
        this.id = id;
        this.author_url = author_url;
        this.author = author;
        this.url = url;
        this.deal_title = deal_title;
        this.tags = tags;
        this.content = content;
        this.salary = salary;
        this.experience = experience;
        this.comp_name = comp_name;
        this.comp_url = comp_url;
        this.addr = addr;
        this.comp_scale = comp_scale;
        this.comp_field = comp_field;
        this.financing = financing;
        this.comp_vision = comp_vision;
        this.created_time = created_time;
        this.extractTime = extractTime;
        this.history = history;
        this.relative_osps = relative_osps;
        this.reply_num = reply_num;
        this.view_num = view_num;
        this.vote_num = vote_num;
        this.collection_num = collection_num;
        this.abstractString = abstractString;
        this.type = type;
        this.crawled_time = crawled_time;
        this.category = category;
        this.source = source;
        this.last_time = last_time;
        this.url_md5 = url_md5;
    }

    // Plain accessors: every getter returns the stored value, every setter replaces it.

    public int getId() { return id; }
    public void setId(int id) { this.id = id; }

    public String getAuthor_url() { return author_url; }
    public void setAuthor_url(String author_url) { this.author_url = author_url; }

    public String getAuthor() { return author; }
    public void setAuthor(String author) { this.author = author; }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public String getDeal_title() { return deal_title; }
    public void setDeal_title(String deal_title) { this.deal_title = deal_title; }

    public String getTags() { return tags; }
    public void setTags(String tags) { this.tags = tags; }

    public String getContent() { return content; }
    public void setContent(String content) { this.content = content; }

    public String getSalary() { return salary; }
    public void setSalary(String salary) { this.salary = salary; }

    public String getExperience() { return experience; }
    public void setExperience(String experience) { this.experience = experience; }

    public String getComp_name() { return comp_name; }
    public void setComp_name(String comp_name) { this.comp_name = comp_name; }

    public String getComp_url() { return comp_url; }
    public void setComp_url(String comp_url) { this.comp_url = comp_url; }

    public String getAddr() { return addr; }
    public void setAddr(String addr) { this.addr = addr; }

    public String getComp_scale() { return comp_scale; }
    public void setComp_scale(String comp_scale) { this.comp_scale = comp_scale; }

    public String getComp_field() { return comp_field; }
    public void setComp_field(String comp_field) { this.comp_field = comp_field; }

    public String getFinancing() { return financing; }
    public void setFinancing(String financing) { this.financing = financing; }

    public String getComp_vision() { return comp_vision; }
    public void setComp_vision(String comp_vision) { this.comp_vision = comp_vision; }

    public String getCreated_time() { return created_time; }
    public void setCreated_time(String created_time) { this.created_time = created_time; }

    public String getExtractTime() { return extractTime; }
    public void setExtractTime(String extractTime) { this.extractTime = extractTime; }

    public int getHistory() { return history; }
    public void setHistory(int history) { this.history = history; }

    public String getRelative_osps() { return relative_osps; }
    public void setRelative_osps(String relative_osps) { this.relative_osps = relative_osps; }

    public int getReply_num() { return reply_num; }
    public void setReply_num(int reply_num) { this.reply_num = reply_num; }

    public int getView_num() { return view_num; }
    public void setView_num(int view_num) { this.view_num = view_num; }

    public int getVote_num() { return vote_num; }
    public void setVote_num(int vote_num) { this.vote_num = vote_num; }

    public int getCollection_num() { return collection_num; }
    public void setCollection_num(int collection_num) { this.collection_num = collection_num; }

    public String getAbstractString() { return abstractString; }
    public void setAbstractString(String abstractString) { this.abstractString = abstractString; }

    public String getType() { return type; }
    public void setType(String type) { this.type = type; }

    public String getCrawled_time() { return crawled_time; }
    public void setCrawled_time(String crawled_time) { this.crawled_time = crawled_time; }

    public String getCategory() { return category; }
    public void setCategory(String category) { this.category = category; }

    public String getSource() { return source; }
    public void setSource(String source) { this.source = source; }

    public String getLast_time() { return last_time; }
    public void setLast_time(String last_time) { this.last_time = last_time; }

    public String getUrl_md5() { return url_md5; }
    public void setUrl_md5(String url_md5) { this.url_md5 = url_md5; }
}

View File

@ -0,0 +1,209 @@
package org.ossean.transfertknowledgeandtagmatch.model;
/**
 * Mutable bean describing one post ("memo"); used as the row type returned by
 * {@code RelativeMemoDao.getMemoInfo}.
 *
 * <p>Property names keep their snake_case spelling. Every getter returns the stored
 * value and every setter replaces it; there is no validation or conversion.
 */
public class RelativeMemo {
    private int id;
    private int osp_id;
    private String subject;
    private String content;
    private String author;
    private int replies_num;
    private int lock;
    private int sticky;
    private String created_time;
    private String updated_time;
    private String url;
    private int view_num_crawled;
    private int vote_up_num;
    private int collection_num;
    private String abstractText;
    private String memo_type;
    private String source;
    private String category;
    private int view_num_trustie;
    private int author_id;
    private int parent_id;
    private int last_reply_id;
    private int is_quote;
    private String username;
    private String userhomeurl;
    private String crawled_time;
    private String author_url;
    private String url_md5;
    private String tags;

    public int getId() { return id; }
    public void setId(int value) { id = value; }

    public int getOsp_id() { return osp_id; }
    public void setOsp_id(int value) { osp_id = value; }

    public String getSubject() { return subject; }
    public void setSubject(String value) { subject = value; }

    public String getContent() { return content; }
    public void setContent(String value) { content = value; }

    public String getAuthor() { return author; }
    public void setAuthor(String value) { author = value; }

    public int getReplies_num() { return replies_num; }
    public void setReplies_num(int value) { replies_num = value; }

    public int getLock() { return lock; }
    public void setLock(int value) { lock = value; }

    public int getSticky() { return sticky; }
    public void setSticky(int value) { sticky = value; }

    public String getCreated_time() { return created_time; }
    public void setCreated_time(String value) { created_time = value; }

    public String getUpdated_time() { return updated_time; }
    public void setUpdated_time(String value) { updated_time = value; }

    public String getUrl() { return url; }
    public void setUrl(String value) { url = value; }

    public int getView_num_crawled() { return view_num_crawled; }
    public void setView_num_crawled(int value) { view_num_crawled = value; }

    public int getVote_up_num() { return vote_up_num; }
    public void setVote_up_num(int value) { vote_up_num = value; }

    public int getCollection_num() { return collection_num; }
    public void setCollection_num(int value) { collection_num = value; }

    public String getAbstractText() { return abstractText; }
    public void setAbstractText(String value) { abstractText = value; }

    public String getMemo_type() { return memo_type; }
    public void setMemo_type(String value) { memo_type = value; }

    public String getSource() { return source; }
    public void setSource(String value) { source = value; }

    public String getCategory() { return category; }
    public void setCategory(String value) { category = value; }

    public int getView_num_trustie() { return view_num_trustie; }
    public void setView_num_trustie(int value) { view_num_trustie = value; }

    public int getAuthor_id() { return author_id; }
    public void setAuthor_id(int value) { author_id = value; }

    public int getParent_id() { return parent_id; }
    public void setParent_id(int value) { parent_id = value; }

    public int getLast_reply_id() { return last_reply_id; }
    public void setLast_reply_id(int value) { last_reply_id = value; }

    public int getIs_quote() { return is_quote; }
    public void setIs_quote(int value) { is_quote = value; }

    public String getUsername() { return username; }
    public void setUsername(String value) { username = value; }

    public String getUserhomeurl() { return userhomeurl; }
    public void setUserhomeurl(String value) { userhomeurl = value; }

    public String getCrawled_time() { return crawled_time; }
    public void setCrawled_time(String value) { crawled_time = value; }

    public String getAuthor_url() { return author_url; }
    public void setAuthor_url(String value) { author_url = value; }

    public String getUrl_md5() { return url_md5; }
    public void setUrl_md5(String value) { url_md5 = value; }

    public String getTags() { return tags; }
    public void setTags(String value) { tags = value; }
}

View File

@ -0,0 +1,306 @@
package org.ossean.transfertknowledgeandtagmatch.model;
/**
 * Mutable bean for one knowledge record (title, content, counters, timestamps and
 * source information).
 *
 * <p>Instances are created through the all-arguments constructor only. Every getter
 * returns the stored value and every setter replaces it, without validation.
 */
public class TKnowledge {
    private int id;
    private String title;
    private String author;
    private String content;
    private String tags;
    private int replyNum;
    private int viewNum;
    private int voteNum;
    private int collectionNum;
    private String strAbstract;
    private String type;
    private String crawledTime;
    private String url;
    private String createdTime;
    private String category;
    private String source;
    private String authorUrl;
    private String lastTime;
    private String urlMd5;

    /**
     * Creates a fully populated record; each argument is stored unchanged in the
     * property of the same name.
     */
    public TKnowledge(int id, String title, String author, String content,
            String tags, int replyNum, int viewNum, int voteNum,
            int collectionNum, String strAbstract, String type,
            String crawledTime, String url, String createdTime, String category,
            String source, String authorUrl, String lastTime, String urlMd5) {
        setId(0);
        this.id = id;
        this.title = title;
        this.author = author;
        this.content = content;
        this.tags = tags;
        this.replyNum = replyNum;
        this.viewNum = viewNum;
        this.voteNum = voteNum;
        this.collectionNum = collectionNum;
        this.strAbstract = strAbstract;
        this.type = type;
        this.crawledTime = crawledTime;
        this.url = url;
        this.createdTime = createdTime;
        this.category = category;
        this.source = source;
        this.authorUrl = authorUrl;
        this.lastTime = lastTime;
        this.urlMd5 = urlMd5;
    }

    public int getId() { return id; }
    public void setId(int value) { id = value; }

    public String getTitle() { return title; }
    public void setTitle(String value) { title = value; }

    public String getAuthor() { return author; }
    public void setAuthor(String value) { author = value; }

    public String getContent() { return content; }
    public void setContent(String value) { content = value; }

    public String getTags() { return tags; }
    public void setTags(String value) { tags = value; }

    public int getReplyNum() { return replyNum; }
    public void setReplyNum(int value) { replyNum = value; }

    public int getViewNum() { return viewNum; }
    public void setViewNum(int value) { viewNum = value; }

    public int getVoteNum() { return voteNum; }
    public void setVoteNum(int value) { voteNum = value; }

    public int getCollectionNum() { return collectionNum; }
    public void setCollectionNum(int value) { collectionNum = value; }

    public String getStrAbstract() { return strAbstract; }
    public void setStrAbstract(String value) { strAbstract = value; }

    public String getType() { return type; }
    public void setType(String value) { type = value; }

    public String getCrawledTime() { return crawledTime; }
    public void setCrawledTime(String value) { crawledTime = value; }

    public String getUrl() { return url; }
    public void setUrl(String value) { url = value; }

    public String getCreatedTime() { return createdTime; }
    public void setCreatedTime(String value) { createdTime = value; }

    public String getCategory() { return category; }
    public void setCategory(String value) { category = value; }

    public String getSource() { return source; }
    public void setSource(String value) { source = value; }

    public String getAuthorUrl() { return authorUrl; }
    public void setAuthorUrl(String value) { authorUrl = value; }

    public String getLastTime() { return lastTime; }
    public void setLastTime(String value) { lastTime = value; }

    public String getUrlMd5() { return urlMd5; }
    public void setUrlMd5(String value) { urlMd5 = value; }
}

View File

@ -0,0 +1,38 @@
package cn.edu.zhanyun.gather_posts;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
 * Placeholder JUnit 3 test case for the module; it contains a single test that
 * always passes.
 */
public class AppTest extends TestCase {

    /**
     * Builds the test case.
     *
     * @param testName name of the test method to run
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return a suite containing every test method of this class
     */
    public static Test suite() {
        return new TestSuite(AppTest.class);
    }

    /** Trivial test that asserts a constant {@code true}. */
    public void testApp() {
        assertTrue(true);
    }
}

View File

@ -31,7 +31,7 @@ public class PointerDAO {
try {
pst.close();
} catch (SQLException e) {
logger.info("error" + e);
logger.info("sql error! " + e);
}
}
return pointer;
@ -75,7 +75,7 @@ public class PointerDAO {
try {
pst.close();
} catch (SQLException e) {
logger.info("sql error ", e);
logger.info("sql error<EFBFBD>", e);
}
}
}