This commit is contained in:
waterbeach 2017-02-28 10:34:48 +08:00
commit 0daac4a3e2
24 changed files with 1787 additions and 638 deletions

View File

@ -4,9 +4,13 @@
<list default="true" id="f59140c6-b612-48dd-baf7-0162af9922f0" name="Default" comment="">
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/.idea/workspace.xml" afterPath="$PROJECT_DIR$/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/.idea/workspace.xml" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/.idea/workspace.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/dao/OpenHubProject_Dao.java" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/dao/OpenHubProject_Dao.java" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/osseanextractor.iml" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/osseanextractor.iml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/pom.xml" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/pom.xml" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/model/openhub_Model.java" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/model/sourceforge_Model.java" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/model/sourceforge_Model.java" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/one/ExtractThread.java" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/one/OpenhubReExtractor.java" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/one/OpenhubReExtractor.java" />
<change type="MODIFICATION" beforePath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/utils/ExtractMutilLink4Openhub.java" afterPath="$PROJECT_DIR$/../../../new_osseanextractor/src/main/java/net/trustie/utils/ExtractMutilLink4Openhub.java" />
</list>
<ignored path="$PROJECT_DIR$/target/" />
<option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
@ -33,28 +37,48 @@
</component>
<component name="FileEditorManager">
<leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
<file leaf-file-name="ListHtmlCrawler.java" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlCrawler.java">
<file leaf-file-name="DetailHtmlProcessor.java" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlProcessor.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<state relative-caret-position="307">
<caret line="45" column="43" lean-forward="true" selection-start-line="45" selection-start-column="43" selection-end-line="45" selection-end-column="43" />
<folding />
</state>
</provider>
</entry>
</file>
<file leaf-file-name="ListHtmlProcessor.java" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlProcessor.java">
<file leaf-file-name="DetailHtmlCrawler.java" pinned="false" current-in-tab="true">
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="238">
<caret line="44" column="22" lean-forward="true" selection-start-line="44" selection-start-column="22" selection-end-line="44" selection-end-column="22" />
<state relative-caret-position="-951">
<caret line="131" column="57" lean-forward="true" selection-start-line="131" selection-start-column="57" selection-end-line="131" selection-end-column="57" />
<folding />
</state>
</provider>
</entry>
</file>
<file leaf-file-name="DetailHtmlPipeline.java" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlPipeline.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="255">
<caret line="33" column="57" lean-forward="true" selection-start-line="33" selection-start-column="57" selection-end-line="33" selection-end-column="57" />
<folding>
<element signature="e#4827#4835#0" expanded="true" />
<element signature="imports" expanded="true" />
</folding>
</state>
</provider>
</entry>
</file>
<file leaf-file-name="DetailHtmlDao.java" pinned="false" current-in-tab="false">
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/dao/DetailHtmlDao.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="123">
<caret line="17" column="26" lean-forward="true" selection-start-line="17" selection-start-column="26" selection-end-line="17" selection-end-column="26" />
<folding />
</state>
</provider>
</entry>
</file>
</leaf>
</component>
<component name="Git.Settings">
@ -144,7 +168,6 @@
<foldersAlwaysOnTop value="true" />
</navigator>
<panes>
<pane id="PackagesPane" />
<pane id="ProjectPane">
<subPane>
<PATH>
@ -157,10 +180,241 @@
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="resources" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="java" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="webmagic" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="utils" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="java" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="webmagic" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="java" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="webmagic" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="one" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="java" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="webmagic" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="one" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="model" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="java" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="webmagic" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="one" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="dao" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
<PATH>
<PATH_ELEMENT>
<option name="myItemId" value="fetchnetworks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.ProjectViewProjectNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="fetch_networks" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="src" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="main" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
<PATH_ELEMENT>
<option name="myItemId" value="java" />
<option name="myItemType" value="com.intellij.ide.projectView.impl.nodes.PsiDirectoryNode" />
</PATH_ELEMENT>
</PATH>
</subPane>
</pane>
<pane id="Scope" />
<pane id="Scratches" />
<pane id="PackagesPane" />
</panes>
</component>
<component name="PropertiesComponent">
@ -624,11 +878,14 @@
<workItem from="1486695485959" duration="431000" />
<workItem from="1486708709471" duration="13000" />
<workItem from="1487174066127" duration="4746000" />
<workItem from="1487307404778" duration="352000" />
<workItem from="1487335460841" duration="598000" />
<workItem from="1487407224064" duration="8108000" />
</task>
<servers />
</component>
<component name="TimeTrackingManager">
<option name="totallyTimeSpent" value="13656000" />
<option name="totallyTimeSpent" value="22714000" />
</component>
<component name="TodoView">
<todo-panel id="selected-file">
@ -641,12 +898,11 @@
</component>
<component name="ToolWindowManager">
<frame x="-8" y="-8" width="1382" height="744" extended-state="6" />
<editor active="false" />
<editor active="true" />
<layout>
<window_info id="Palette" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="TODO" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="5" side_tool="false" content_ui="tabs" />
<window_info id="Nl-Palette" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Messages" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32761577" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Palette&#9;" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Image Layers" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Capture Analysis" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
@ -659,7 +915,7 @@
<window_info id="Terminal" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="10" side_tool="false" content_ui="tabs" />
<window_info id="Capture Tool" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Designer" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="2" side_tool="false" content_ui="tabs" />
<window_info id="Project" active="true" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.23373677" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="Project" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="true" show_stripe_button="true" weight="0.24508321" sideWeight="0.5" order="0" side_tool="false" content_ui="combo" />
<window_info id="Database" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="3" side_tool="false" content_ui="tabs" />
<window_info id="Structure" active="false" anchor="left" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
<window_info id="Ant Build" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
@ -671,6 +927,7 @@
<window_info id="Message" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Commander" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="0" side_tool="false" content_ui="tabs" />
<window_info id="Hierarchy" active="false" anchor="right" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.25" sideWeight="0.5" order="2" side_tool="false" content_ui="combo" />
<window_info id="Messages" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.32761577" sideWeight="0.5" order="6" side_tool="false" content_ui="tabs" />
<window_info id="Inspection" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.4" sideWeight="0.5" order="4" side_tool="false" content_ui="tabs" />
<window_info id="Find" active="false" anchor="bottom" auto_hide="false" internal_type="DOCKED" type="DOCKED" visible="false" show_stripe_button="true" weight="0.33" sideWeight="0.5" order="1" side_tool="false" content_ui="tabs" />
</layout>
@ -694,7 +951,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -702,9 +961,7 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="e#4827#4835#0" expanded="true" />
</folding>
<folding />
</state>
</provider>
</entry>
@ -712,7 +969,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -720,8 +979,70 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<folding>
<element signature="e#4827#4835#0" expanded="true" />
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlProcessor.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlProcessor.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlProcessor.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
@ -730,7 +1051,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -738,7 +1061,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -746,7 +1071,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -754,7 +1081,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -762,15 +1091,9 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
@ -786,11 +1109,7 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding>
<element signature="imports" expanded="false" />
<element signature="e#3598#3607#0" expanded="false" />
<element signature="e#4491#4499#0" expanded="false" />
</folding>
<folding />
</state>
</provider>
</entry>
@ -802,18 +1121,6 @@
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="34">
<caret line="36" column="21" lean-forward="false" selection-start-line="36" selection-start-column="21" selection-end-line="36" selection-end-column="21" />
<folding>
<element signature="imports" expanded="false" />
<element signature="e#3598#3607#0" expanded="false" />
<element signature="e#4491#4499#0" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/resources/spring/applicationContext-myBatis.xml">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
@ -826,19 +1133,100 @@
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="442">
<caret line="72" column="0" lean-forward="false" selection-start-line="72" selection-start-column="0" selection-end-line="72" selection-end-column="0" />
<folding>
<element signature="imports" expanded="false" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/spider.conf.xml">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/main.iml">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/resources/spring/applicationContext.xml">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="0">
<caret line="0" column="0" lean-forward="false" selection-start-line="0" selection-start-column="0" selection-end-line="0" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/ListHtmlProcessor.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="238">
<caret line="44" column="22" lean-forward="true" selection-start-line="44" selection-start-column="22" selection-end-line="44" selection-end-column="22" />
<state relative-caret-position="17">
<caret line="34" column="0" lean-forward="false" selection-start-line="34" selection-start-column="0" selection-end-line="34" selection-end-column="0" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/model/AbstractHtml.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-17">
<caret line="8" column="32" lean-forward="true" selection-start-line="8" selection-start-column="32" selection-end-line="8" selection-end-column="32" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/model/DetailHtml.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="187">
<caret line="11" column="25" lean-forward="true" selection-start-line="11" selection-start-column="25" selection-end-line="11" selection-end-column="25" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/utils/DBPipeline.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="68">
<caret line="4" column="54" lean-forward="true" selection-start-line="4" selection-start-column="54" selection-end-line="4" selection-end-column="54" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/dao/DetailHtmlDao.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="123">
<caret line="17" column="26" lean-forward="true" selection-start-line="17" selection-start-column="26" selection-end-line="17" selection-end-column="26" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlPipeline.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="255">
<caret line="33" column="57" lean-forward="true" selection-start-line="33" selection-start-column="57" selection-end-line="33" selection-end-column="57" />
<folding>
<element signature="e#4827#4835#0" expanded="true" />
<element signature="imports" expanded="true" />
</folding>
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlProcessor.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="307">
<caret line="45" column="43" lean-forward="true" selection-start-line="45" selection-start-column="43" selection-end-line="45" selection-end-column="43" />
<folding />
</state>
</provider>
</entry>
<entry file="file://$PROJECT_DIR$/src/main/java/net/trustie/webmagic/one/DetailHtmlCrawler.java">
<provider selected="true" editor-type-id="text-editor">
<state relative-caret-position="-951">
<caret line="131" column="57" lean-forward="true" selection-start-line="131" selection-start-column="57" selection-end-line="131" selection-end-column="57" />
<folding />
</state>
</provider>
</entry>
</component>
</project>

File diff suppressed because it is too large Load Diff

View File

@ -11,7 +11,6 @@
</content>
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
<orderEntry type="library" name="Maven: commons-io:commons-io:2.4" level="project" />
<orderEntry type="library" name="Maven: junit:junit-dep:4.10" level="project" />
<orderEntry type="library" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" />
<orderEntry type="library" name="Maven: us.codecraft:webmagic-core:0.5.2" level="project" />

View File

@ -82,6 +82,5 @@
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,46 @@
package net.trustie.dao;
import net.trustie.model.openhub_retry_Model;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
* Created by zaihuilvcha on 2017/2/18.
*/
public class OpenHubRetry_Dao {
private Connection conn = null;
private Statement stmt = null;
public OpenHubRetry_Dao() {
try {
Class.forName("com.mysql.jdbc.Driver");
String url = "jdbc:mysql://localhost:3306/extract_result?user=root&password=123456";
conn = DriverManager.getConnection(url);
stmt = conn.createStatement();
} catch (ClassNotFoundException e) {
e.printStackTrace();
} catch (SQLException e) {
e.printStackTrace();
}
}
public int add(openhub_retry_Model oprm) {
try {
String sql = "INSERT INTO `extract_result`.`openhub_download_fail` (`url`, `html`) VALUES (?, ?);";
PreparedStatement ps = conn.prepareStatement(sql);
ps.setString(1, oprm.getUrl());
ps.setString(2,oprm.getHtml());
return ps.executeUpdate();
} catch (SQLException e) {
e.printStackTrace();
}
return -1;
}
}

View File

@ -6,7 +6,8 @@ import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import net.trustie.utils.DateHandler;
import core.*;
import net.trustie.dao.OpenHubRetry_Dao;
import net.trustie.utils.ExtractMutilLink4Openhub;
import net.trustie.utils.Seperator;
import net.trustie.utils.StringHandler;
@ -18,9 +19,8 @@ import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import core.AfterExtractor;
import core.Page;
import core.ValidateExtractor;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;
import us.codecraft.webmagic.model.annotation.ExtractBy;
@ExtractBy("//div[@id='projects_show_page']")
@ -49,10 +49,11 @@ public class openhub_Model implements AfterExtractor, ValidateExtractor {
// "| //*[@id='projects_show_page']/div[2]/div[3]/div[2]/div/*/*/a/regex(\"<a href='(.*?)'>.*Homepage</a>\",1)/@href" //jquery multi links
// +"| //*[@id='projects_show_page']/div[2]/div[4]/div[2]/div/*/*/a/regex(\"<a href='(.*?)'>.*Homepage</a>\",1)/@href "
//)
@ExtractBy(value="//a/regex(\"<a.*href=\"(.*?)\".*Homepage.*</a>\",1) " +
" | //a/regex(\"<a.*href=\"(.*?)\".*Download.*</a>\",1)" +
" | //a/regex(\"<a.*href=\"(.*?)\".*Other.*</a>\",1)")
private List<String> homepages = new ArrayList<String>();
// @ExtractBy(value="//a/regex(\"<a.*href=\"(.*?)\".*Homepage.*</a>\",1) " +
// " | //a/regex(\"<a.*href=\"(.*?)\".*Download.*</a>\",1)" +
// " | //a/regex(\"<a.*href=\"(.*?)\".*Other.*</a>\",1)")
@ExtractBy(value="//a/regex(\"<a.*href=\"(.*?)\".*Homepage.*</a>\",1) ")
private List<String> homepages = new ArrayList<String>();
private static String homepage ="";
///////////////////////////////////
@ -401,6 +402,7 @@ private List<String> homepages = new ArrayList<String>();
}
@Override
public void validate(Page page) {
//
@ -423,18 +425,43 @@ private List<String> homepages = new ArrayList<String>();
}
this.rateLevel = this.rateLevel.substring(0, this.rateLevel.indexOf("/"));
if (StringHandler.isAtLeastOneBlank(this.name, this.activity
,this.description
,getHomepage()
,this.getHomepage()
/* ,
* this.licenses
*/)) {
page.setResultSkip(this, true);
//多homepage抽取时下载失败需要将对应page存库.downloadFailFlag为存库失败标志
if(downloadFailFlag) {
//存库开始先将下载失败标志置为假
downloadFailFlag = false;
System.out.println("homepage页面下载失败条目准备入库...");
openhub_retry_Model oprm = new openhub_retry_Model();
oprm.setUrl(page.getPageUrl());
oprm.setHtml(page.getRawText());
OpenHubRetry_Dao opDao = new OpenHubRetry_Dao();
opDao.add(oprm);
System.out.println("下载失败条目已存入数据库...");
}
return;
}
}
private void handleQuickRef(Element quickRef) {
@ -1274,6 +1301,7 @@ private List<String> homepages = new ArrayList<String>();
this.history = history;
}
private boolean downloadFailFlag = false;
public String getValidHomepage(String homePage){
String result = homePage;
@ -1281,6 +1309,16 @@ private List<String> homepages = new ArrayList<String>();
if(!homePage.equals("") && homePage.startsWith("/")){
//获取Homepage列表中的Homepage
result = new ExtractMutilLink4Openhub().extractLinks(homePage);
/**
* result=""需要对相应page加入存库处理
* 这里必然是多homepage情况所以此处若正常则result不为空若为空则说明下载失败了必须存库
*/
if(result.equals("")) {
//存库标志位置为true
downloadFailFlag = true;
}
}
return result;
}

View File

@ -0,0 +1,36 @@
package net.trustie.model;
/**
* Created by zaihuilvcha on 2017/2/18.
*/
// Plain value holder for one OpenHub page whose homepage download failed:
// the page URL plus the raw HTML that was fetched.
public class openhub_retry_Model {
    // URL of the page that needs to be retried.
    String url;
    // Raw HTML captured for that page.
    String html;

    /** @return the stored page URL */
    public String getUrl() {
        return this.url;
    }

    /** @param url the page URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the stored raw HTML */
    public String getHtml() {
        return this.html;
    }

    /** @param html the raw HTML to store */
    public void setHtml(String html) {
        this.html = html;
    }

    /** Renders both fields in the form {@code openhub_retry_Model{url='...', html='...'}}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("openhub_retry_Model{");
        text.append("url='").append(url).append('\'');
        text.append(", html='").append(html).append('\'');
        text.append('}');
        return text.toString();
    }
}

View File

@ -28,9 +28,9 @@ import java.util.regex.Pattern;
public class sourceforge_Model implements AfterExtractor, ValidateExtractor{
///////////////
@ExtractBy("//a[@id='homepage']/@href " +
" | //*[@class='homepage-link']/a/@href")
//@ExtractBy("//a[@id='homepage']/@href " +
// " | //*[@class='homepage-link']/a/@href")
@ExtractBy("//*[@id='homepage']/@href ")
private String homepage;
//////////////////

View File

@ -84,13 +84,16 @@ public class ExtractThread implements Runnable{
Extractor extractor = new Extractor();
RawPage result = null;
while(pages.size() > 0){
/**
* 注意这里的page就是detail表中的一条记录
*/
for( RawPage page : pages){
try{
long startTime=System.currentTimeMillis(); //获取开始时间
result = extractor.extract(page,pageModel);
long endTime=System.currentTimeMillis(); //获取结束时间
// System.out.println("页面抽取时间: "+(endTime-startTime)+"ms");
// System.out.println("页面抽取时间: "+(endTime-startTime)+"ms");
startTime=System.currentTimeMillis(); //获取开始时间
//持久化 更新抽取历史
saveResult(site,result);
@ -98,10 +101,11 @@ public class ExtractThread implements Runnable{
// System.out.println("结果保存时间: "+(endTime-startTime)+"ms");
}catch (Exception e){
e.printStackTrace();
// pageErrorOutPut.returnErrorPage(page, e);错误页面
// e.printStackTrace();
// pageErrorOutPut.returnErrorPage(page, e); //错误页面
}
}
//结果入库之后再更新lastId.注意处理一批才会存一次
updateLastId(site,lastId + pages.size());
lastId = getLastId(site);
pages = getPages(site,lastId);

View File

@ -95,7 +95,7 @@ class ReExtractThread implements Runnable{
saveResult(site, rawPage);
}catch (Exception e){
e.printStackTrace();
// pageErrorOutPut.returnErrorPage(page, e);错误页面
// pageErrorOutPut.returnErrorPage(page, e); //错误页面
}
}
//更新抽取游标

View File

@ -12,7 +12,7 @@ import java.util.List;
* Description抽取openhub中多链接的项目
*/
public class ExtractMutilLink4Openhub implements PageProcessor{
private Site site = Site.me().setRetryTimes(10).setSleepTime(500);
private Site site = Site.me().setRetryTimes(5).setSleepTime(500);
private static List<String> homepages;
private static Page _page;
@ -40,27 +40,28 @@ public class ExtractMutilLink4Openhub implements PageProcessor{
}
}catch(Exception e) {
}
return ExtractMutilLink4Openhub.homepages;
}
/**
* 获取多个link;间隔
* 获取多个link;间隔.此方法为二次抽取的入口方法
* @param url
* @return
*/
public String extractLinks(String url){
String homepage = "";
doExtract(url);
doExtract(url); //如果成功了的话得到了homepages
try {
List<String> tempPages = getHomepages(url);
List<String> tempPages = getHomepages(url); //以防初次抽取的homepages没抽成功若没成功的话通过getHomepages继续抽
if (tempPages != null && !tempPages.isEmpty()) {
for (String link : tempPages) {
if(!homepage.contains(link)){//避免加入重复的链接
homepage += link + ";";
}
}
homepage = homepage.substring(0, homepage.lastIndexOf(";"));
homepage = homepage.substring(0, homepage.lastIndexOf(";")); //将结果最后的去除
}
}catch (Exception e){
e.printStackTrace();
@ -89,7 +90,7 @@ public class ExtractMutilLink4Openhub implements PageProcessor{
* 抽取Homepage核心方法
* @param url 抽取目标url链接
*/
public static void doExtract(String url){
public static void doExtract(String url) {
Spider.create(extractMutilLink4Openhub).addUrl(new String("https://www.openhub.net"+url)).thread(1).run();
}

View File

@ -47,10 +47,13 @@ public interface OpenSourceProjectDao {
@Param("tags") String tags);
// 批量获取项目
@Select("select id,source,url,filtration from open_source_projects where filtration=0 limit #{batchSize}")
public List<OpenSourceProject> getBatchPrjs(
@Select("select id,source,url,filtration from open_source_projects where filtration=0 and id >= #{startId} limit #{batchSize}")
public List<OpenSourceProject> getBatchPrjs(@Param("startId") int startId,
@Param("batchSize") int batchSize);
@Select("select min(id) from open_source_projects where filtration=#{filtration}")
public int getMinId(@Param("filtration") int filtration);
// 删除项目
@Update("delete from open_source_projects where id=#{id}")
public void deleteProject(@Param("id") int id);

View File

@ -3,7 +3,7 @@ package com.ossean.projectmanager.model;
public class OpenhubProject {
private String description;
private String name;
private String codeRepository;
private String code_repository;
public String getDescription() {
return description;
}
@ -16,10 +16,11 @@ public class OpenhubProject {
public void setName(String name) {
this.name = name;
}
public String getCodeRepository() {
return codeRepository;
public String getCode_repository() {
return code_repository;
}
public void setCodeRepository(String codeRepository) {
this.codeRepository = codeRepository;
public void setCode_repository(String code_repository) {
this.code_repository = code_repository;
}
}

View File

@ -37,9 +37,10 @@ public class ProjectsFilter {
*/
public void filtratePrjs() {
logger.info("Reading projects......");
int startId = lastProjectDao.getMinId(0);
while (true) {
List<OpenSourceProject> prjsList = lastProjectDao
.getBatchPrjs(batchsize);
.getBatchPrjs(startId,batchsize);
if(prjsList.size()==0){
logger.info("Filter done......sleeping......");
try {
@ -81,16 +82,16 @@ public class ProjectsFilter {
&& !"".equals(openhubProject.getName())
&& openhubProject.getDescription() != null
&& !"".equals(openhubProject.getDescription())
&& openhubProject.getCodeRepository() != null
&& !openhubProject.getCodeRepository().contains(
"add a code location")) { // openhub的筛选条件为namedescription不为空且该项目有版本库
&& openhubProject.getCode_repository() != null
&& !openhubProject.getCode_repository().contains(
"Add a code location")) { // openhub的筛选条件为namedescription不为空且该项目有版本库
lastProjectDao.updateFiltratedPrj(project.getId(),
1); // 筛选标识从0变为1表示该项目经过筛选新增的
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果确保无之前的匹配结果
} else {
lastProjectDao.deleteProject(project.getId());
//lastProjectDao.deleteProject(project.getId());
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果
@ -111,7 +112,7 @@ public class ProjectsFilter {
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果确保无之前的匹配结果
} else {
lastProjectDao.deleteProject(project.getId());
//lastProjectDao.deleteProject(project.getId());
matchResultDao.deleteMatchResult(
getTargetTable(project.getId()),
project.getId()); // 删除该项目的匹配结果
@ -127,6 +128,7 @@ public class ProjectsFilter {
logger.info("Unknown source... source = " + source);
}
}
startId = prjsList.get(prjsList.size()-1).getId()+1;
}

View File

@ -39,9 +39,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://localhost:3306/ossean_new?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="123456" />
<property name="password" value="1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>

View File

@ -15,8 +15,14 @@
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.springframework.ide.eclipse.core.springbuilder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.springframework.ide.eclipse.core.springnature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
</natures>

View File

@ -17,7 +17,7 @@ public class ClearTableBeforTransfer {
Logger logger = Logger.getLogger(ClearTableBeforTransfer.class);
public void start(){
truncateTable(TableName.openSourceProjectsTableName);
deleteItemInEddPointers(TableName.eddRelationTableName, TableName.openSourceProjectsTableName);
deleteItemInEddPointers(TableName.pointerTableName,TableName.eddRelationTableName, TableName.openSourceProjectsTableName);
deleteTaggingsForProject(TableName.taggingsTableName);
logger.info("完成转移程序运行前数据表的清空和taggings表OpenSourceProject的删除");
}
@ -58,8 +58,8 @@ public class ClearTableBeforTransfer {
}
}
//删除edd_pointers表中transfer对应的数据
public void deleteItemInEddPointers(String sourceTableName, String targetTableName){
String sql = "delete from edd_pointers where SourceTableName=? and TargetTableName=?";
public void deleteItemInEddPointers(String pointerName,String sourceTableName, String targetTableName){
String sql = "delete from "+pointerName+" where SourceTableName=? and TargetTableName=?";
Connection conn = getConnection();
if(conn == null){
logger.info("没有获取到Connection");

View File

@ -38,6 +38,12 @@ public class GetSynonyms{
private static String synonymsTableName = TableName.synonymsTableName;
private static String pointerTableName = TableName.pointerTableName;
private static int batchSize = 1000;
//为了很好的将别名提取的程序结构化特设以下三个全局变量在从项目名称中提取别名时使用
String projectNameWithoutBracket = "";//项目名去除括号
String projectNameWithoutComName = "";//项目名称去除括号和公司基金名字
String bracket = "";//括号中内容
public void start() {
long start_all = System.currentTimeMillis();
@ -99,207 +105,517 @@ public class GetSynonyms{
}
/**
 * Entry point for synonym extraction: dispatches on the project's source.
 *
 * @param model the project to extract synonyms for
 * @return the synonym map produced by the GitHub-specific extractor when the source is
 *         "github", otherwise the result of the generic (non-GitHub) extractor
 */
public Map<String, Integer> getSynonyms(GatherProjectsModel model) {
    final boolean fromGithub = model.getSource().equals("github");
    return fromGithub
            ? getSynonymsFromGithubPrj(model)   // GitHub projects
            : getSynonymsNotGitPrj(model);      // every other source
}
@SuppressWarnings("finally")
public Map<String, Integer> getSynonymsFromGithubPrj(GatherProjectsModel model)
{
Map<String, Integer> SynonymMap = new HashMap<String, Integer>();// 记录每个别名及其准确度<Sysnonym,1>表时Synonym可靠性很大
List<String> synonymsList = new ArrayList<String>();// 最终返回的同义词列表
List<String> synonymsList_notsure = new ArrayList<String>();
model.setName(model.getName().trim());
if(model.getDescription()!=null && !model.getDescription().equals(""))
if(model.getDescription() != null)
{
model.setDescription(model.getDescription().trim());
}
else
{
model.setDescription("");
}
//记录每个别名及其准确度<Sysnonym,1>表时Synonym可靠性很大
Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
//最终返回的可以确定正确的同义词列表
List<String> synonymsList = new ArrayList<String>();
String description = model.getDescription();
String prjName = model.getName();
try
{
//github项目名称中有斜杠提取斜杠后内容做为别名
String subName = getSubName(prjName);
if(subName != null)
{
SynonymMap.put(subName, 1);
}
//将subName中短横线下划线用空格代替以及将短横线下划线删除后获取的字符串当做别名
String strReplace = ReplaceSpecial(subName);
if(!strReplace.equals(subName))
{
synonymsList.add(strReplace);
}
//删除特殊符号如短横线下划线,如果其包含在description中则做为别名
String strDelete = DeleteSpecial(subName);
if(!strDelete.equals(subName))
{
if(description.toLowerCase().contains(strDelete.toLowerCase()))
{
synonymsList.add(strDelete);
}
}
if(description == null)
{
//如果描述信息为空别名就只包括已经提取出的和从名字中获取的别名
if(synonymsList != null)
for(String sure:synonymsList)
SynonymMap.put(sure, 1);
return SynonymMap;
}
//描述信息非空时尝试从description中提取别名
Map<String, Integer> synonymsFromDesc = getSynonymsFromDescOfGitprj(description,model.getName());
//将别名连同其确定性一起放入SynonymMap中
for(String tmpSyn:synonymsList)
SynonymMap.put(tmpSyn, 1);
//将从描述信息中获取的别名及其确定性放入SynonymMap
if(synonymsFromDesc != null)
for(String key:synonymsFromDesc.keySet())
SynonymMap.put(key, synonymsFromDesc.get(key));
return SynonymMap;
}catch(Exception e)
{
e.printStackTrace();
logger.error("Error while extracting synonyms from project:" + model.getId());
System.exit(0);
}finally
{
return SynonymMap;
}
}
private Map<String, Integer> getSynonymsNotGitPrj(GatherProjectsModel model) {
model.setName(model.getName().trim());
if(model.getDescription() != null)
model.setDescription(model.getDescription().trim());
List<String> synonymsList = new ArrayList<String>();//能够确定正确的别名
List<String> synonymsList_notsure = new ArrayList<String>();//不能百分百确定的别名
String description = model.getDescription();
String prjName = model.getName();
List<String> result = null;
if (description == null) {
if(description == null)
{
//如果description为null则别名无需再做改动直接返回null
return null;
}
List<String> linkNameList = new ArrayList<String>();// 用于存储所有提取出来的关联项目名
List<String> keywords = FileReader.read("./files/language.txt");
// 对项目名进行处理,去除公司名或基金组织的信息&&去除括号没有公司没有括号则包含自己
String projectNameWithoutBracket = StringHandler_ProjectName
.removeBracket(prjName);
String projectNameWithoutComName = StringHandler_ProjectName
.getProjectWithoutComName(projectNameWithoutBracket);
// String projectNameWithoutComName =
// StringHandler_ProjectName.getProjectWithoutComName(prjName);
// projectNameWithoutComName =
// StringHandler_ProjectName.removeBracket(projectNameWithoutComName);
if (!"".equals(projectNameWithoutComName)) {
List<String> linkNameList = new ArrayList<String>();//用于存储所有提取出来的可能是别名的词
List<String> keywords = FileReader.read("./files/language.txt");//用于过滤类似于"Java","C++"这样的词
projectNameWithoutBracket = StringHandler_ProjectName.removeBracket(prjName);
projectNameWithoutComName = StringHandler_ProjectName.getProjectWithoutComName(projectNameWithoutBracket);
//提取项目名括号中的信息不是项目类型
bracket = StringHandler_ProjectName.getBracket(prjName);
//去除括号及括号内容后即可做为别名
if(!"".equals(projectNameWithoutComName))
{
synonymsList.add(projectNameWithoutComName);
// logger.info("gets synonyms:" + projectNameWithoutComName +
// " Without ComName and remove racket");
logger.info("gets synonyms:" + projectNameWithoutComName + " Without ComName and remove racket");
}
// 提取项目名括号中的信息不是项目类型
String bracket = StringHandler_ProjectName.getBracket(prjName);
if (bracket != null && !"".equals(bracket)) {
bracket = RegexHandler.extractEngDecChiAndDot(bracket);// 去除括号中信息的特殊符号
if (!RegexHandler.onlySpecialChar(bracket)) {
if (!keywords.contains(bracket.toLowerCase())) {// 括号中的信息不是语言类关键字词
if (StringHandler.isFullOrShort(bracket,
projectNameWithoutBracket)) {
synonymsList.add(bracket);
} else {
if (StringHandler.specialIsFullOrShort(bracket,
projectNameWithoutBracket))
try
{
//TODO 将对项目名称的处理过程独立为一个函数
getSynonymsFromName(synonymsList,linkNameList,keywords);
if(description != null)
getSynonymsFromDescOfNotGitprj(model,description,prjName,synonymsList,linkNameList,keywords);
//result包含去掉公司信息括号内的be动词之前的
result = removeDupStr(linkNameList);
//TODO 将从linkName中提取全/缩写独立出来
getSynonymsByFullOrShort(result,synonymsList);
//TODO 将从linkName中依据相似度提取别名的程序独立出来
getSynonymsBySimilarDegree(model.getName(),result,synonymsList_notsure,keywords,"notGitPrj");
}catch(Exception e)
{
logger.error("Error while extracting synonym from prj:" + model.getId() + ":" +e);
}
return afterExtract(synonymsList, synonymsList_notsure);
}
public void getSynonymsFromName(List<String> synonymsList,List<String> linkNameList,List<String> keywords)
{
if(bracket != null && projectNameWithoutBracket != null && projectNameWithoutComName != null)
{
// Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
if(!"".equals(bracket))
{
bracket = RegexHandler.extractEngDecChiAndDot(bracket);//去除括号中信息的特殊符号
if(!RegexHandler.onlySpecialChar(bracket))
{
if(!keywords.contains(bracket.toLowerCase()))//括号中的信息不是语言类关键字词
{
//如果括号中内容个项目名存在全/缩写关系
if(StringHandler.isFullOrShort(bracket, projectNameWithoutBracket))
{
synonymsList.add(bracket);
//SynonymMap.put(bracket, 1);
}
else
linkNameList.add(bracket);
{
if(StringHandler.specialIsFullOrShort(bracket, projectNameWithoutBracket))
synonymsList.add(bracket);
else linkNameList.add(bracket);
}
}
}
}
}
if (description != null) {
List<String> beWords = FileReader.read("./files/beVerb.txt");// 读取be动词表
description = StringHandler.getFirstSentence(description);
for (String beWord : beWords) {
String linkName = StringHandler.findLinkName(description,
beWord);// 按系动词表优先级进行匹配
if (linkName == null) {// 表示没有匹配到当前的系动词 进行下一个匹配
continue;
}
if (linkName.toLowerCase().equals(prjName.toLowerCase()))
continue;
// 特征短语去噪
List<String> linkNameResult = EDDHandler
.getMainAndViceDescriptionByBracket(linkName);// 提取主副描述信息(括号中)
linkNameResult = EDDHandler
.getMainAndViceDescriptionByKnowAs(linkNameResult);// 提取主副描述信息(known
// as+连词)
linkNameResult = EDDHandler.removeComma(linkNameResult);// 去除定语描述信息(逗号定语)
linkNameResult = EDDHandler
.removeDemonstrativeWords(linkNameResult);// 去除定语描述信息(定冠词The)
linkNameResult = EDDHandler
.removeArrtibutiveClause(linkNameResult);// 去除从句描述信息
linkNameResult = EDDHandler.removePronoun(linkNameResult);// 去除指示代词
linkNameResult = EDDHandler.removeProject(linkNameResult);// 去除项目常用词
linkNameResult = EDDHandler.removePPCombine(linkNameResult);// 去除指示代词和项目常用词两者的笛卡尔积
linkNameResult = RegexHandler
.extractEngDecChiAndDot(linkNameResult);// 提取项目中
linkNameResult = RegexHandler.removeOnlySpecial(linkNameResult);// 删除只有特殊字符的同义词
for (String name : linkNameResult) {
// logger.info("be 动词之前的有: "+name);
// be动词前不包括项目类型且和项目名不相同
if (!keywords.contains(name.toLowerCase().trim())
&& !name.toLowerCase().trim()
.equals(prjName.toLowerCase().trim())
&& !name.toLowerCase()
.trim()
.equals(projectNameWithoutComName
.toLowerCase().trim())) {
// 这里的prjName是完整的name
if (prjName.toLowerCase().indexOf(name.toLowerCase()) != -1) {
String withoutShortDash = RemoveShortDash(name);
if (bracket == null || bracket.equals(""))
continue;
if (name.toLowerCase()
.equals(bracket.toLowerCase())
|| name.toLowerCase().equals(
withoutShortDash))
synonymsList.add(name);
else {
int nameLength1 = withoutShortDash.split(" ").length;
int nameLength2 = projectNameWithoutBracket
.split(" ").length;
int nameLength = name.trim().split(" ").length;
double ratio1 = (nameLength1 / nameLength) * 1.0;
double ratio2 = (nameLength2 / nameLength) * 1.0;
if (nameLength1 > 1 && ratio1 > 0.6)
synonymsList.add(name);
else {
if (nameLength2 > 1 && ratio2 > 0.6)
synonymsList.add(name);
else
linkNameList.add(name);
}
}
// synonymsList.add(name);//问题出在这个地方如果从discription中提取出的词在名字中有出现就直接加入
// 会导致比如Ajax被识别为Ajax Implement的别名而实际上并不是这样的
// logger.info("gets synonyms:" + name +
// " from description before be verb");
} else
linkNameList.add(name);
}
}
}
}
// result包含去掉公司信息括号内的be动词之前的
result = removeDupStr(linkNameList);
// logger.info("result去掉公司信息、括号内的、be动词之前的;");
// for (int i = 0; i < result.size(); i++)
// System.out.println("result: " + result.get(i));
// 去除重复的
String fullName = StringHandler.getFullName(
projectNameWithoutComName.toLowerCase(), result);
if (!fullName.equals("")) {
synonymsList.add(fullName.toLowerCase());// 表示项目名确实是缩写 并提取到了项目全名
// logger.info("项目的全称为: "+fullName);
}
String shortName = StringHandler.getShortName(
projectNameWithoutComName.toLowerCase(), result);
if (!shortName.equals("")) {
synonymsList.add(shortName.toLowerCase());// 表示项目名确实是全称 并提取到了项目缩写
// logger.info("项目的简称为: "+shortName);
}
// 同义词要和项目名称有一定的相似度
for (int i = 0; i < result.size(); i++) {
}
public void getSynonymsBySimilarDegree(String Name,List<String> result,List<String> synonymsList_notsure,List<String> keywords,String prjSource)
{
//同义词要和项目名称有一定的相似度
for(int i = 0; i < result.size(); i++){
String extract = "";
String synonym = result.get(i);
String[] words = synonym.split(" ");// 按照空格进行分词
for (String word : words) {
if (model.getName().toLowerCase().indexOf(word.toLowerCase()) >= 0) {// 查看每个词在项目名中是否出现
String[] words = synonym.split(" ");//按照空格进行分词
for(String word:words){
if(Name.toLowerCase().indexOf(word.toLowerCase()) >= 0){//查看每个词在项目名中是否出现
extract += word + " ";
}
}
if (extract.length() != 0)
if(extract.length() !=0)
extract = extract.substring(0, extract.length() - 1);
// logger.info("extract : "+extract);
if (!"".equals(extract) && !keywords.contains(extract)) {
// 提取的别名不和项目名重复
if (!extract.toLowerCase().equals(prjName.toLowerCase().trim())
&& !extract
.toLowerCase()
.trim()
.equals(projectNameWithoutComName.toLowerCase()
.trim())) {
double similarDegree = SimilarDegree(extract,
projectNameWithoutComName);
// System.out.println("字符串间的相似度为: "+similarDegree);
if (similarDegree >= 0.6 && similarDegree < 1.0) {
// logger.info("from extract after similar "+extract);
//logger.info("extract : "+extract);
if(!"".equals(extract) && !keywords.contains(extract))
{
boolean AbleToCalcSimilar = false;
if(prjSource.equals("github"))
{
String strReplace = ReplaceSpecial(getSubName(Name));
if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(strReplace.toLowerCase().trim()))
AbleToCalcSimilar = true;
}
if(!prjSource.equals("notGitPrj"))
{
if(Name!= null && projectNameWithoutBracket != null)
if(!extract.toLowerCase().equals(Name.toLowerCase().trim()) && !extract.toLowerCase().trim().equals(projectNameWithoutComName.toLowerCase().trim()))
AbleToCalcSimilar = true;
}
//提取的别名不和项目名重复
if(AbleToCalcSimilar)
{
double similarDegree = 0.0;
if(!prjSource.equals("github")) similarDegree= SimilarDegree(extract,projectNameWithoutComName);
if(prjSource.equals("github")) similarDegree = SimilarDegree(extract, Name);
if(similarDegree >= 0.6 && similarDegree <1.0)
{
logger.info("from extract after similar "+extract);
synonymsList_notsure.add(extract);
}
}
}
}
removeDupStr(synonymsList);// 去除同名列表中重复的
removeDupStr(synonymsList_notsure);
for (String s : synonymsList) {
// logger.info("finally synonyms:" + s + " by description");
}
/**
 * Looks among the candidate phrases for a full form and a short form of the project name
 * (brackets and company information already removed) and records any hit, lower-cased,
 * as a confirmed synonym.
 *
 * @param result       candidate phrases to search
 * @param synonymsList output list of confirmed synonyms
 */
public void getSynonymsByFullOrShort(List<String> result,List<String>synonymsList)
{
    // Project name appears to be an abbreviation and a matching full name was found.
    String expanded = StringHandler.getFullName(projectNameWithoutComName.toLowerCase(), result);
    if (expanded.length() != 0) {
        synonymsList.add(expanded.toLowerCase());
    }

    // Project name appears to be a full name and a matching abbreviation was found.
    String abbreviated = StringHandler.getShortName(projectNameWithoutComName.toLowerCase(), result);
    if (abbreviated.length() != 0) {
        synonymsList.add(abbreviated.toLowerCase());
    }
}
public void getSynonymsFromDescOfNotGitprj(GatherProjectsModel model,String description,String prjName,List<String> synonymsList,List<String> linkNameList,List<String> keywords)
{
for (String sure : synonymsList)
SynonymMap.put(sure, 1);
for (String Notsure : synonymsList_notsure)
SynonymMap.put(Notsure, 0);
return SynonymMap;
List<String> beWords = FileReader.read("./files/beVerb.txt");//读取be动词表
//TODO 将对description的分词处理部分独立出来
description = StringHandler.getFirstSentence(description);
for(String beWord:beWords){
String linkName = StringHandler.findLinkName(description, beWord);//按系动词表优先级进行匹配
if(linkName == null){//表示没有匹配到当前的系动词 进行下一个匹配
continue;
}
if(linkName.toLowerCase().equals(prjName.toLowerCase()))
continue;
//特征短语去噪
List<String> linkNameResult =getLinkNameResult(linkName);
handleLinkNameResult(model,linkNameResult,synonymsList,linkNameList,keywords);
}
}
/**
 * Extracts synonyms for a GitHub project from its description.
 *
 * For each be-verb in ./files/beVerb.txt, the phrase returned by StringHandler.findLinkName
 * for the description's first sentence is de-noised via getLinkNameResult and each resulting
 * name is classified: confirmed synonym, or candidate for the similarity check.
 *
 * @param description project description (dereferenced without a null check)
 * @param prjName     project name; getSubName(prjName) is used for all comparisons
 * @return the map built by afterExtract from the confirmed and the "not sure" synonym lists
 */
public Map<String, Integer> getSynonymsFromDescOfGitprj(String description,String prjName)
{
List<String> keywords = FileReader.read("./files/language.txt");
List<String> beWords = FileReader.read("./files/beVerb.txt");
// Only the first sentence of the description is used for be-verb matching.
String descriptionOfFirstSentence = StringHandler.getFirstSentence(description);
// Name part derived from the slash-separated project name.
String subName = getSubName(prjName);
// subName with '-' and '_' replaced by spaces.
String strReplace = ReplaceSpecial(subName);
// subName with the special characters removed outright.
String strDelete = DeleteSpecial(subName);
List<String> linkNameList = new ArrayList<String>();
List<String> synonymsList = new ArrayList<String>();// synonyms considered certainly correct
List<String> synonymsList_not_sure = new ArrayList<String>();// synonyms that satisfy the extraction rules but are not certain
// int matchBewords = 0;// (disabled) would indicate whether any be-verb matched in the description
List<String> linkNameResult = new ArrayList<String>();// de-noised names obtained from the description
int length = beWords.size();
for(int k=0;k<length;k++)
{
// Try the be-verbs in list order (presumably priority order — confirm against beVerb.txt).
String linkName = StringHandler.findLinkName(descriptionOfFirstSentence, beWords.get(k));
if(linkName == null)
{
// No phrase for this be-verb: move on, except on the last be-verb, where the
// description itself (at most its first 100 characters) is used as a fallback.
if(k < length-1)
continue;
else
{
if(description.length()<100)
linkName = description;
else linkName = description.substring(0, 100);
}
}
// If the extracted phrase equals subName there is nothing new to learn; skip it.
if(linkName.toLowerCase().equals(subName.toLowerCase()))
continue;
// De-noise the extracted phrase into individual names.
linkNameResult = getLinkNameResult(linkName);
if(linkNameResult != null)
{
// Accumulates the processed names into one string.
// NOTE(review): strFromDesc is built and trimmed but never read afterwards.
String strFromDesc="";
for(String name:linkNameResult)
{
logger.info("be 动词之前的有: "+name);
// Ignore keywords from language.txt and names identical to subName.
if(!keywords.contains(name.toLowerCase().trim()) && !name.toLowerCase().trim().equals(subName.toLowerCase().trim()) )
{
// Same text as subName once special characters are removed: confirmed synonym.
if(DeleteSpecial(name).equals(strDelete) && DeleteSpecial(name)!= null)
{
synonymsList.add(name);
}
else
{
// Otherwise try to recognise a full-name/abbreviation relationship.
String synonymsFromGitPrj = getSynonymByFullOrShortOfGitPrj(strReplace.toLowerCase(),strDelete.toLowerCase(),name.toLowerCase());
if(synonymsFromGitPrj != null)
{
synonymsList.add(synonymsFromGitPrj);
}
else
{
// Neither rule matched: keep the name as a candidate for the similarity check below.
linkNameList.add(name.toLowerCase());
}
}
}
// Re-assemble the processed description pieces into a string.
strFromDesc = strFromDesc + name + " ";
}
strFromDesc = strFromDesc.trim();
}
}
// De-duplicate the remaining candidates and run the similarity-based extraction on them.
List<String>result = removeDupStr(linkNameList);
getSynonymsBySimilarDegree(subName, result, synonymsList_not_sure, keywords,"github");
return afterExtract(synonymsList,synonymsList_not_sure);
}
/**
 * De-noises a phrase extracted from a description by running it through the fixed
 * EDDHandler / RegexHandler clean-up pipeline.
 *
 * @param linkName the raw phrase taken from the description
 * @return the list of cleaned-up names produced by the last pipeline stage
 */
public List<String> getLinkNameResult(String linkName)
{
    // Stage 1: split into main / secondary descriptions.
    List<String> names = EDDHandler.getMainAndViceDescriptionByBracket(linkName); // by bracket content
    names = EDDHandler.getMainAndViceDescriptionByKnowAs(names);                  // by "known as" + conjunction

    // Stage 2: strip modifiers and filler.
    names = EDDHandler.removeComma(names);               // comma-separated attributive parts
    names = EDDHandler.removeDemonstrativeWords(names);  // definite article "The"
    names = EDDHandler.removeArrtibutiveClause(names);   // attributive clauses
    names = EDDHandler.removePronoun(names);             // demonstrative pronouns
    names = EDDHandler.removeProject(names);             // common project words
    names = EDDHandler.removePPCombine(names);           // combinations (cartesian product) of the two above

    // Stage 3: character-level clean-up.
    names = RegexHandler.extractEngDecChiAndDot(names);  // character extraction per RegexHandler
    names = RegexHandler.removeOnlySpecial(names);       // drop entries consisting only of special characters
    return names;
}
/**
 * Classifies each de-noised name taken from a non-GitHub project's description as either a
 * confirmed synonym (added to {@code synonymsList}) or a candidate for later similarity
 * checks (added to {@code linkNameList}).
 *
 * Fixes relative to the previous version:
 * <ul>
 *   <li>the word-count ratios were computed with integer division
 *       ({@code (a/b) * 1.0}), so any ratio below 1 collapsed to 0 before the
 *       {@code > 0.6} comparison; they are now real floating-point ratios;</li>
 *   <li>{@code bracket} may be null (other code in this class null-checks it); the comparison
 *       against it no longer throws NullPointerException.</li>
 * </ul>
 *
 * @param model          project being processed; only its name is read
 * @param linkNameResult de-noised names extracted from the description
 * @param synonymsList   output: confirmed synonyms
 * @param linkNameList   output: candidates that still need similarity checks
 * @param keywords       words that must never be reported as synonyms
 */
public void handleLinkNameResult(GatherProjectsModel model,List<String> linkNameResult,List<String> synonymsList,List<String>linkNameList,List<String>keywords)
{
    for (String name : linkNameResult) {
        logger.info("be 动词之前的有: "+name);
        String normalized = name.toLowerCase().trim();

        // Skip keywords and anything that merely repeats the project name
        // (full name, or name without brackets/company information).
        if (keywords.contains(normalized)
                || normalized.equals(model.getName().toLowerCase().trim())
                || normalized.equals(projectNameWithoutComName.toLowerCase().trim())) {
            continue;
        }

        // Names that do not occur inside the full project name are only candidates.
        if (model.getName().toLowerCase().indexOf(name.toLowerCase()) == -1) {
            linkNameList.add(name);
            continue;
        }

        String withoutShortDash = RemoveShortDash(name);
        boolean matchesBracket = bracket != null && name.toLowerCase().equals(bracket.toLowerCase());
        if (matchesBracket || name.toLowerCase().equals(withoutShortDash)) {
            synonymsList.add(name);
            continue;
        }

        // Compare word counts. split(" ") never returns an empty array, so nameLength >= 1.
        int nameLength1 = withoutShortDash.split(" ").length;
        int nameLength2 = projectNameWithoutBracket.split(" ").length;
        int nameLength = name.trim().split(" ").length;
        double ratio1 = (double) nameLength1 / nameLength;
        double ratio2 = (double) nameLength2 / nameLength;

        if ((nameLength1 > 1 && ratio1 > 0.6) || (nameLength2 > 1 && ratio2 > 0.6)) {
            synonymsList.add(name);
        } else {
            linkNameList.add(name);
        }
    }
}
// Returns the part of prjName that starts at its first '/' (the slash itself is
// kept); a name that contains no slash is returned unchanged.
private String getSubName(String prjName) {
    int slashAt = prjName.indexOf('/');
    return slashAt < 0 ? prjName : prjName.substring(slashAt);
}
/**
 * Returns subName with every underscore, short dash, dot and slash removed.
 * All other characters are kept in their original order.
 *
 * @param subName text to strip; must not be null
 * @return the text without '_', '-', '.' and '/'
 */
public static String DeleteSpecial(String subName) {
    StringBuilder kept = new StringBuilder(subName.length());
    for (char ch : subName.toCharArray()) {
        switch (ch) {
            case '_': // underscore
            case '-': // short dash
            case '.':
            case '/':
                break; // dropped
            default:
                kept.append(ch);
        }
    }
    return kept.toString();
}
/**
 * Returns subName with every short dash and underscore turned into a space.
 * The length of the text is unchanged; other characters are left alone.
 *
 * @param subName text to normalise; must not be null
 * @return the text with '-' and '_' replaced by ' '
 */
public static String ReplaceSpecial(String subName) {
    return subName.replace('-', ' ').replace('_', ' ');
}
/**
 * Combines the confirmed synonyms and the unconfirmed ones into a single map
 * from synonym to certainty flag: 1 for entries of synonymsList, 0 for entries
 * of synonymsList_not_sure. Both lists are de-duplicated with removeDupStr
 * first. The unconfirmed entries are written last, so a name present in both
 * lists ends up with 0.
 *
 * @param synonymsList          synonyms considered certainly correct; may be null
 * @param synonymsList_not_sure synonyms whose correctness is uncertain; may be null
 * @return the synonym-to-flag map; empty when both inputs are null
 */
public Map<String,Integer>afterExtract(List<String> synonymsList,List<String>synonymsList_not_sure)
{
    Map<String,Integer> SynonymMap = new HashMap<String,Integer>();
    if(synonymsList != null)
    {
        List<String> sureNames = removeDupStr(synonymsList);
        if(sureNames != null)
            for(String sure:sureNames)
            {
                SynonymMap.put(sure, 1);
            }
    }
    if(synonymsList_not_sure != null)
    {
        List<String> unsureNames = removeDupStr(synonymsList_not_sure);
        // Guard the de-duplicated list the same way as the confirmed one above;
        // previously a null result here would have thrown in the for-each.
        if(unsureNames != null)
            for(String Notsure:unsureNames)
            {
                SynonymMap.put(Notsure, 0);
            }
    }
    return SynonymMap;
}
/**
 * Returns descWithoutBracketHandled when it and subName are recognised as a
 * full-name / short-name pair, otherwise null.
 *
 * The description is normalised with ReplaceSpecial and lower-cased, subName is
 * lower-cased, and the pair is accepted if either StringHandler.isFullOrShort
 * or StringHandler.specialIsFullOrShortForGitPrj returns true.
 *
 * @param subName                   project sub-name to compare against
 * @param subName2                  not read by the active code (only the disabled
 *                                  DeleteSpecial variant below refers to it)
 * @param descWithoutBracketHandled candidate description
 * @return the candidate unchanged on a match, null otherwise
 */
public String getSynonymByFullOrShortOfGitPrj(String subName,String subName2,String descWithoutBracketHandled)
{
    String normalizedDesc = ReplaceSpecial(descWithoutBracketHandled).toLowerCase();
    String normalizedName = subName.toLowerCase();
    boolean matched = StringHandler.isFullOrShort(normalizedDesc, normalizedName)
            || StringHandler.specialIsFullOrShortForGitPrj(normalizedDesc, normalizedName);
    // Disabled variant that compared the DeleteSpecial form against subName2:
    // if(StringHandler.isFullOrShort(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()) ||
    //         StringHandler.specialIsFullOrShortForGitPrj(DeleteSpecial(descWithoutBracketHandled).toLowerCase(), subName2.toLowerCase()))
    // {
    //     return descWithoutBracketHandled;
    // }
    return matched ? descWithoutBracketHandled : null;
}
public static String RemoveShortDash(String name) {
String withoutShortDash = "";
if (name.indexOf('-') > -1)

View File

@ -4,6 +4,7 @@ import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Set;
import javax.annotation.Resource;
@ -46,13 +47,15 @@ public class TestDate {
}
public void start_test(){
String homepage = "http://github.com/apache/abdera;http://incubator.apache.org/abdera/";
String homepage1 = "http://incubator.apache.org/abdera/";
String arr[] = homepage1.split(";");
System.out.println(arr.length);
for(String a : arr){
System.out.println(a);
}
// String homepage = "http://github.com/apache/abdera;http://incubator.apache.org/abdera/";
// String homepage1 = "http://incubator.apache.org/abdera/";
// String arr[] = homepage1.split(";");
// System.out.println(arr.length);
// for(String a : arr){
// System.out.println(a);
// }
Set<Integer> set = dbDest.selectOpenSourceProjectsIds("gather_projects_test");
System.out.println(set.size());

View File

@ -67,7 +67,7 @@ public class TransferProjects {
}
public void start(){
boolean isUpdate = false;
haveTransferSet = new HashSet<Integer>();
haveTransferSet = dbDest.selectOpenSourceProjectsIds(targetTableName);
int delCount = 0;
@ -85,6 +85,7 @@ public class TransferProjects {
}
}
for(EddRelations relation:eddRelationList){
boolean isUpdate = false;
String gather_projects_ids = relation.getGather_projects_ids();
try {
gather_projects_ids = gather_projects_ids.substring(1, gather_projects_ids.length() - 1);

View File

@ -208,7 +208,7 @@ public class MergeProjectNew2 {
matchedIdList.add(uniqueModel.getId());
}
for (GatherProjectsModel tmp : sameHomeMap.get(uniqueSource)) {
if (!tmp.equals(uniqueModel) && tmp.getUpdate_mark() != 2) {
if (tmp.getId() != uniqueModel.getId() && tmp.getUpdate_mark() != 2) {
// 插入关系表
String relationStr = "," + tmp.getId() + ",";
dbSource.insertEddRelations(eddRelationTableName,
@ -319,7 +319,7 @@ public class MergeProjectNew2 {
GatherProjectsModel uniqueModel = getUniqueSourceModel(map
.get(uniqueSource));
for (GatherProjectsModel tmp : map.get(uniqueSource)) {
if (!tmp.equals(uniqueModel)) {
if (tmp.getId()!= uniqueModel.getId()) {
// 插入关系表
String tmpStr = "," + tmp.getId() + ",";
if (tmp.getUpdate_mark() != 2) {
@ -439,7 +439,7 @@ public class MergeProjectNew2 {
uniqueModel = getUniqueSourceModel(map.get(uniqueSource));
for (GatherProjectsModel tmp : map.get(uniqueSource)) {
if (!tmp.equals(uniqueModel) && tmp.getUpdate_mark() != 2) {
if (tmp.getId() != uniqueModel.getId() && tmp.getUpdate_mark() != 2) {
// 插入关系表
String relationStr = "," + tmp.getId() + ",";
dbSource.insertEddRelations(eddRelationTableName,
@ -774,7 +774,7 @@ public class MergeProjectNew2 {
public void test() {
GatherProjectsModel model = gatherDao.selectGPMById(
gatherProjectsTableName, 1);
gatherProjectsTableName, 543);
handleNewProject(model, false);
}
}

View File

@ -50,7 +50,6 @@ public class StringHandler {
}
}
resultStrArr = tmp.toArray(new String[0]);
//System.out.println("bangbangbang"+resultStrArr[0]+" "+resultStrArr[1]);
return resultStrArr;
}
@ -289,13 +288,18 @@ public class StringHandler {
}
public static boolean getFullName(String shortName, String synonym,int a){
//查看当前的同义词是不是含不含有全称
String[] words = synonym.toLowerCase().split(" ");
String shortenWord = "";
for(int i = 0; i < words.length; i++){
String word = words[i];
String tmp = word.substring(0, 1);//取出第一个英文字符
String tmp = "";
if(word.length() == 1)
tmp = word;
else
if(word.length() > 0)
tmp = word.substring(0, 1);//取出第一个英文字符
shortenWord += tmp;
}
@ -358,26 +362,22 @@ public class StringHandler {
public static boolean specialIsFullOrShort(String shortName, String fullName)
{
//System.out.println("进入了特殊别名提取函数");
//首先获取两个字符串中单词的
String[] words_fullName= fullName.toLowerCase().split(" ");
String[] words_shortName = shortName.toLowerCase().split(" ");
if(words_fullName.length==1 && words_shortName.length==1)
{
//System.out.println("两者长度都大于1");
return false;
}
if(words_shortName.length == 1 && words_fullName.length > 1 )
{
//System.out.println("fullName的长度大于1");
if(specialIsFullOrShorDetail(shortName,fullName))
return true;
}
else
{
//System.out.println("shortName长度大于1");
if(specialIsFullOrShorDetail(fullName,shortName))
return true;
}
@ -397,10 +397,15 @@ public class StringHandler {
//要求全写中各个单词的首字母必须在缩写中出现而且是顺序出现或最多有一个不出现在缩写中
for(int i = 0; i < words_fullName.length; i++){
String word = words_fullName[i].toLowerCase();
String tmp = word.substring(0, 1);//取出第一个英文字符
String tmp = "";//取出第一个英文字符
if(word.length() == 1)
tmp = word;
else
if(word.length() > 0)
tmp = word.substring(0, 1);//取出第一个英文字符
if(shortName.toLowerCase().contains(tmp))
{
//System.out.println("进入了详细字符串比对函数");
// tmpFlag = shortName.indexOf(tmp);
// if(tmpFlag > positionFlag)
// {
@ -412,11 +417,10 @@ public class StringHandler {
/**
如果满足全写中各个单词的首字母都在缩写中顺序出现就接着
判断缩写中所有字幕是否都在写中出现
判断缩写中所有字幕是否都在写中出现
*/
if(matchCount > words_fullName.length-2)
{
//System.out.println("匹配个数与全写中单词个数相等");
if(isAllContained(shortName,fullName))
return true;
}
@ -540,4 +544,50 @@ public class StringHandler {
return shortName;//表示描述信息中出现了缩写形式
return "";
}
/**
 * Full-name / abbreviation check used for Git project names.
 *
 * Two cheap rejections run first: the pair is refused when more than two
 * pieces of fullName occur inside shortName, or when either string contains
 * the other. After that both strings are split on spaces and the comparison is
 * delegated to specialIsFullOrShorDetail, with the single-word string passed
 * as the abbreviation.
 *
 * @param shortName candidate abbreviation (callers may pass the longer text here;
 *                  the argument order is swapped below when it has several words)
 * @param fullName  candidate full name; pieces may be joined by '_', '-' or ' '
 * @return true when specialIsFullOrShorDetail accepts the pair, false otherwise
 */
public static boolean specialIsFullOrShortForGitPrj(String shortName, String fullName)
{
    // Split fullName on the delimiter it actually contains. The previous code split
    // on '-' when it found '_' and on '_' when it found '-', and a missing "else"
    // let the second test overwrite the first result, so '_'/'-' names were never
    // broken into their pieces.
    String[] splitFullName;
    if(fullName.indexOf("_") > -1)
        splitFullName = fullName.split("_");
    else if(fullName.indexOf("-") > -1)
        splitFullName = fullName.split("-");
    else
        splitFullName = fullName.split(" ");
    int containedNum = 0;// number of fullName pieces that occur inside shortName
    for(String tmp:splitFullName)
        // An empty piece (e.g. from a leading delimiter) is "contained" in every
        // string, so it must not be counted.
        if(tmp.length() > 0 && shortName.contains(tmp))
            containedNum++;
    if(containedNum > 2)
        return false;
    if(fullName.contains(shortName) || shortName.contains(fullName))
        return false;
    // Word lists of both strings.
    String[] words_fullName= fullName.toLowerCase().split(" ");
    String[] words_shortName = shortName.toLowerCase().split(" ");
    if(words_fullName.length==1 && words_shortName.length==1)
        return false;
    if(words_shortName.length == 1 && words_fullName.length > 1 )
        return specialIsFullOrShorDetail(shortName,fullName);
    // shortName has several words: treat it as the full form instead.
    return specialIsFullOrShorDetail(fullName,shortName);
}
}

View File

@ -16,6 +16,7 @@ import javax.annotation.Resource;
import org.apache.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.dao.DuplicateKeyException;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Propagation;
import org.springframework.transaction.annotation.Transactional;
@ -202,13 +203,24 @@ public class TransferProjectsUtil2 {
//logger.info("!!!!!!!!"+gather_projects_ids);
//logger.info(osp.getId());
OpenSourceProject formerOsp = getFormerOsp(gather_projects_ids);
try{
if(formerOsp.getId() != osp.getId() && formerOsp.getFiltration()==2)
osp.setFiltration(1);
}catch(Exception e){
System.out.println(gather_projects_ids);
}
dbDest.deleteOpenSourceProjectsItem(openSourceProjectTableName,formerOsp.getId());
dbSource.deleteTaggingsByTaggableId(taggingTableName, formerOsp.getId()); //由于没有级联删除 因此需要手动删除
dbDest.deleteMatchResult(formerOsp.getId(), getTargetTable(formerOsp.getId()));
}
//try{
dbDest.insertOsp(openSourceProjectTableName, osp);//将数据插入open_source_projects表中
//}catch (DuplicateKeyException e){
// System.out.println(isUpdate);
// System.out.println(e);
//}
int osp_id = osp.getId();
String tags = osp.getTags();//tag分离
if(tags != null){//表示该项目没有标签
@ -422,8 +434,14 @@ public class TransferProjectsUtil2 {
* 如果全部相同 则不进行后续操作
*/
public OpenSourceProject getFormerOsp(String gather_projects_ids){
String[] idArray = gather_projects_ids.split(",");
OpenSourceProject osp_former = null;
if(gather_projects_ids.indexOf(",")<0){
osp_former = dbDest.selectOpenSourceProjectsItem(openSourceProjectTableName, Integer.parseInt(gather_projects_ids));
return osp_former;
}
String[] idArray = gather_projects_ids.split(",");
for(String id_str:idArray){
//查找对应osp记录
int id = Integer.parseInt(id_str);

View File

@ -19,9 +19,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://localhost/ossean?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://localhost/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="123456" />
<property name="password" value="1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>
@ -38,9 +38,9 @@
destroy-method="close">
<property name="driverClassName" value="com.mysql.jdbc.Driver" />
<property name="url"
value="jdbc:mysql://localhost:3306/ossean?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
value="jdbc:mysql://localhost:3306/ossean_production?characterEncoding=UTF-8&amp;zeroDateTimeBehavior=convertToNull&amp;autoReconnect=true" />
<property name="username" value="root" />
<property name="password" value="123456" />
<property name="password" value="1234" />
<property name="validationQuery" value="SELECT 1" />
<property name="testOnBorrow" value="true"/>
</bean>