Skip to content

Commit fd84f19

Browse files
Ahad RanaAhad Rana
Ahad Rana
authored and
Ahad Rana
committed
Initial checkin of the cc repository
0 parents  commit fd84f19

File tree

307 files changed

+197913
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

307 files changed

+197913
-0
lines changed

README

Whitespace-only changes.

bin/launcher.sh

+151
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
#!/usr/bin/env bash
2+
3+
# Follow $0 through any chain of symbolic links so that the installation
# root is derived from the real script, not from the link that was invoked.
this="$0"
while [ -h "$this" ]; do
  # `ls -ld` prints "<name> -> <target>"; keep only the target part.
  listing=$(ls -ld "$this")
  link=$(expr "$listing" : '.*-> \(.*\)$')
  case "$link" in
    */*)
      # The target carries a directory component: use it as given.
      this="$link"
      ;;
    *)
      # Bare file name: it lives next to the link itself.
      this="$(dirname "$this")/$link"
      ;;
  esac
done

# Turn the (possibly relative) script location into an absolute path.
script=$(basename "$this")
bin=$(dirname "$this")
bin=$(cd "$bin"; pwd)
this="$bin/$script"
22+
23+
# The root of the app installation is the parent of the directory that
# holds this script; the conf/log/lib locations hang off that root.
CCAPP_HOME="$(dirname "$this")/.."
CCAPP_CONF_DIR="$CCAPP_HOME/conf"
CCAPP_LOG_DIR="$CCAPP_HOME/logs"
CCAPP_LIB_DIR="$CCAPP_HOME/lib"
export CCAPP_HOME CCAPP_CONF_DIR CCAPP_LOG_DIR CCAPP_LIB_DIR

echo "CCAPP_HOME:$CCAPP_HOME"
echo "CCAPP_CONF_DIR:$CCAPP_CONF_DIR"
echo "CCAPP_LOG_DIR:$CCAPP_LOG_DIR"

#######################################
# Print the file name (without directory) of the first commoncrawl jar
# found in a directory.
# Arguments: $1 - directory to search
# Outputs:   jar file name on stdout; nothing when no jar exists
# Returns:   0 when a jar was found, 1 otherwise
#######################################
ccapp_find_jar() {
  local jar
  # Iterate over the glob instead of handing it to `[ -e ... ]`: with more
  # than one match `[` fails with "too many arguments" and the jar would be
  # reported as missing even though it is there.
  for jar in "$1"/commoncrawl-*.jar; do
    if [ -e "$jar" ]; then
      basename "$jar"
      return 0
    fi
  done
  return 1
}

if CCAPP_JAR=$(ccapp_find_jar "$CCAPP_HOME/build"); then
  CCAPP_JAR_PATH="$CCAPP_HOME/build"
  echo "CCAPP_JAR:$CCAPP_JAR"
  echo "CCAPP_JAR_PATH:$CCAPP_JAR_PATH"
else
  # Deliberately not fatal, as before: the script only reports the problem.
  echo "Please build commoncrawl jar"
fi
41+
42+
# A JDK location is mandatory: stop right away when JAVA_HOME is unset
# or empty, otherwise report which one is going to be used.
if [ -z "$JAVA_HOME" ]; then
  echo "Error: JAVA_HOME is not set."
  exit 1
fi
echo "JAVA_HOME:$JAVA_HOME"
48+
49+
50+
# HADOOP_HOME normally comes from the environment. When it is missing, fall
# back to the "hadoop.path" entry of the user's ~/build.properties; give up
# with exit status 1 when neither source yields a value.
if [ -z "$HADOOP_HOME" ]; then
  echo "HADOOP_HOME not defined. Attempting to locate via ~/build.properties"
  if [ -r "$HOME/build.properties" ]; then
    # Match only the real key (anchored, dot escaped) so that comments or
    # similarly named keys are ignored, and keep the first match only so the
    # result can never span several lines.
    HADOOP_HOME=$(sed -n \
      's/^[[:space:]]*hadoop\.path[[:space:]]*=[[:space:]]*//p' \
      "$HOME/build.properties" | head -n 1)
  fi

  if [ -n "$HADOOP_HOME" ]; then
    echo "Derived HADOOP_HOME from build.properties to be:$HADOOP_HOME"
  else
    echo "Failed to extract HADOOP_HOME from build.properties. Please set HADOOP_HOME to point to Hadoop Distribution"
    exit 1
  fi
fi
61+
62+
#######################################
# Print the path of the Hadoop core jar below a Hadoop home directory.
# A locally built jar in build/ is preferred over the jar shipped in the
# root of the distribution.
# Arguments: $1 - Hadoop home directory
# Outputs:   path of the first matching jar on stdout
# Returns:   0 when a jar was found, 1 otherwise
#######################################
ccapp_find_hadoop_jar() {
  local jar
  # The former `[ -z <glob> ]` check tested a non-empty string and was
  # therefore never true, so build/ was silently ignored. Probe the
  # candidates for existence instead, in priority order.
  for jar in "$1"/build/hadoop-*-core.jar "$1"/hadoop-*-core.jar; do
    if [ -e "$jar" ]; then
      printf '%s\n' "$jar"
      return 0
    fi
  done
  return 1
}

# Locate the hadoop core jar; a missing jar is reported but not fatal here.
HADOOP_JAR=$(ccapp_find_hadoop_jar "$HADOOP_HOME") ||
  echo "Warning: no hadoop-*-core.jar found under $HADOOP_HOME" >&2

# The Hadoop configuration defaults to the conf directory of the distribution.
if [ -z "$HADOOP_CONF_DIR" ]; then
  HADOOP_CONF_DIR="$HADOOP_HOME/conf"
fi

echo "HADOOP_JAR:$HADOOP_JAR"
echo "HADOOP_CONF_DIR:$HADOOP_CONF_DIR"
75+
76+
#######################################
# Append every *.jar file of a directory to CLASSPATH.
# Globals:   CLASSPATH (read and written)
# Arguments: $1 - directory whose jars are appended
# Returns:   0
#######################################
ccapp_append_jars() {
  local jar
  for jar in "$1"/*.jar; do
    # An unmatched glob stays literal; skip it so that no "dir/*.jar"
    # pseudo entry ends up on the class path.
    [ -e "$jar" ] || continue
    CLASSPATH="${CLASSPATH}:${jar}"
  done
  return 0
}

# CLASSPATH initially contains CCAPP_CONF:HADOOP_CONF_DIR
CLASSPATH="${CCAPP_CONF_DIR}"
CLASSPATH="${CLASSPATH}:${HADOOP_CONF_DIR}"
# and add in test path ...
CLASSPATH="${CLASSPATH}:${CCAPP_HOME}/tests"
# next add tools.jar
CLASSPATH="${CLASSPATH}:${JAVA_HOME}/lib/tools.jar"
# next add the commoncrawl jar, ahead of every library jar
CLASSPATH="${CLASSPATH}:${CCAPP_JAR_PATH}/${CCAPP_JAR}"
# then add the libraries shipped with commoncrawl
ccapp_append_jars "${CCAPP_HOME}/lib"
# next add hadoop jar path
CLASSPATH="${CLASSPATH}:${HADOOP_JAR}"
# add hadoop libs to CLASSPATH
ccapp_append_jars "${HADOOP_HOME}/lib"
# and add jetty libs ...
ccapp_append_jars "${HADOOP_HOME}/lib/jetty-ext"

echo ""
echo "CLASSPATH:$CLASSPATH"
echo ""
103+
104+
# The first command line argument names the Java main class to launch.
CCAPP_CLASS_NAME=$1
[ -n "$CCAPP_CLASS_NAME" ] || {
  echo "No Main Class Specified!"
  exit 1
}

echo "CCAPP_CLASS_NAME:$CCAPP_CLASS_NAME"
# Short application name: whatever follows the last dot of the class name
# (the whole name when it contains no dot).
CCAPP_NAME=${CCAPP_CLASS_NAME##*.}
echo "CCAPP_NAME:$CCAPP_NAME"
# Log file name handed to the JVM via -Dcommoncrawl.log.file.
CCAPP_LOG_FILE="${CCAPP_NAME}.log"
115+
116+
# Use a 2000 MB heap ceiling unless the caller already supplied one.
: "${JAVA_HEAP_MAX:=-Xmx2000m}"

JAVA="$JAVA_HOME/bin/java"

# Ask Hadoop for its platform name string and replace blanks by
# underscores. (A second, identical substitution pass would be a no-op,
# so a single one is enough.)
JAVA_PLATFORM=$(CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName | sed -e 's/ /_/g')
echo "Platform Name is:"${JAVA_PLATFORM}
# Library search path for the JVM: the lib dir plus its per-platform
# native sub directory.
JAVA_LIBRARY_PATH="${CCAPP_LIB_DIR}:${CCAPP_LIB_DIR}/native/${JAVA_PLATFORM}"
# The per-platform native directory is also put first on the execution path.
export PATH="${CCAPP_LIB_DIR}/native/${JAVA_PLATFORM}:$PATH"
# and ld_library path
#export LD_LIBRARY_PATH=${CCAPP_LIB_DIR}/native/${JAVA_PLATFORM}:$LD_LIBRARY_PATH

# Append the launcher's JVM options, in order, to whatever CCAPP_VMARGS
# the caller may have provided.
for vmarg in \
  "-Dcommoncrawl.log.dir=$CCAPP_LOG_DIR" \
  "-Dcommoncrawl.log.file=$CCAPP_LOG_FILE" \
  "-Dhadoop.home.dir=$HADOOP_HOME" \
  "-Dcommoncrawl.root.logger=${CCAPP_ROOT_LOGGER:-INFO,DRFA}" \
  "$JAVA_HEAP_MAX" \
  "-XX:+UseParNewGC -XX:ParallelGCThreads=8 -XX:NewSize=200m -XX:+PrintGCDetails" \
  "-Djava.library.path=${JAVA_LIBRARY_PATH}" \
  "-Dcc.native.lib.path=${CCAPP_LIB_DIR}/native/${JAVA_PLATFORM}"; do
  CCAPP_VMARGS="$CCAPP_VMARGS $vmarg"
done
140+
141+
142+
# Assemble the JVM invocation as an array: main class first, then the
# remaining script arguments. Previously the arguments were baked into the
# command string AND appended again on the nohup line, so the main class saw
# each of them twice (and the class name only got onto the command line by
# accident, through "$@", because $CCAPP_CLASS was never defined).
# CCAPP_VMARGS is a blank separated list of JVM flags and is split on purpose.
# shellcheck disable=SC2206
ccapp_cmd=("$JAVA" $CCAPP_VMARGS -classpath "$CLASSPATH" "$CCAPP_CLASS_NAME" "${@:2}")

# Flat string form, kept for display purposes only.
CCAPP_CMD_LINE="${ccapp_cmd[*]}"
CCAPP_RUN_LOG="$CCAPP_LOG_DIR/${CCAPP_NAME}_run.log"
echo "CCAPP_CMD_LINE:$CCAPP_CMD_LINE"

# The redirect below fails when the log directory does not exist yet.
mkdir -p "$CCAPP_LOG_DIR"

# Detach the JVM from the terminal; stdout and stderr go to the run log.
nohup "${ccapp_cmd[@]}" > "$CCAPP_RUN_LOG" 2>&1 < /dev/null &
# Remember the PID of the background JVM so it can be found again later.
echo $! > "/tmp/${CCAPP_NAME}.pid"
echo "Process PID Is:$!  StdOut,StdErr logged to: $CCAPP_RUN_LOG"
# Give the JVM a moment to start, then show the first lines of its output.
sleep 1; head "$CCAPP_RUN_LOG"
149+
150+
151+

build.properties

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
hadoop.version=hadoop-0.20.2+320
2+
hadoop.path=./lib/third_party/hadoop-0.20.2+320

build.xml

+187
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
<?xml version="1.0"?>
2+
3+
<project name="commoncrawl" default="compile">
4+
5+
<!-- Load all the default properties, and any the user wants -->
6+
<!-- to contribute (without having to type -D or edit this file -->
7+
<property file="${user.home}/build.properties" />
8+
<property file="${basedir}/build.properties" />
9+
<property name="Name" value="commoncrawl"/>
10+
<property name="name" value="commoncrawl"/>
11+
<property name="version" value="0.1"/>
12+
<property name="final.name" value="${name}-${version}"/>
13+
14+
<fail message="Please define Hadoop Version via hadoop.version in your build.properties file">
15+
<condition>
16+
<not>
17+
<isset property="hadoop.version"/>
18+
</not>
19+
</condition>
20+
</fail>
21+
<fail message="Please define Hadoop Base Path via hadoop.path in your build.properties file">
22+
<condition>
23+
<not>
24+
<isset property="hadoop.path"/>
25+
</not>
26+
</condition>
27+
</fail>
28+
29+
<property name="src.dir" value="${basedir}/src"/>
30+
<property name="amazon.src.dir" value="${basedir}/lib/third_party/amazon"/>
31+
<property name="lib.dir" value="${basedir}/lib"/>
32+
<property name="conf.dir" value="${basedir}/conf"/>
33+
34+
<property name="build.dir" value="${basedir}/build"/>
35+
<property name="build.classes" value="${build.dir}/classes"/>
36+
<property name="build.src" value="${build.dir}/src"/>
37+
<property name="build.webapps" value="${build.dir}/webapps"/>
38+
<property name="build.anttasks" value="${build.dir}/ant"/>
39+
40+
<!-- convert spaces to _ so that mac os doesn't break things -->
41+
<exec executable="sed" inputstring="${os.name}"
42+
outputproperty="nonspace.os">
43+
<arg value="s/ /_/g"/>
44+
</exec>
45+
46+
<property name="build.platform"
47+
value="${nonspace.os}-${os.arch}-${sun.arch.data.model}"/>
48+
49+
50+
<property name="build.encoding" value="ISO-8859-1"/>
51+
<property name="dist.dir" value="${build.dir}/${final.name}"/>
52+
53+
<property name="javac.debug" value="on"/>
54+
<property name="javac.optimize" value="on"/>
55+
<property name="javac.deprecation" value="off"/>
56+
<property name="javac.version" value="1.5"/>
57+
<property name="javac.args" value=""/>
58+
<property name="javac.args.warnings" value=""/>
59+
60+
61+
<!-- the normal classpath -->
62+
<echo message="Processing Class Path"/>
63+
<path id="classpath">
64+
<pathelement location="${build.classes}"/>
65+
<fileset dir="${lib.dir}">
66+
<include name="jets3t*.jar"/>
67+
</fileset>
68+
<fileset dir="${lib.dir}">
69+
<include name="*.jar" />
70+
<include name="**/*.jar" />
71+
</fileset>
72+
<fileset dir="${hadoop.path}">
73+
<include name="lib/**/*.jar"/>
74+
<include name="hadoop-*-core.jar"/>
75+
<exclude name="lib/jets3t-0.6.1.jar"/>
76+
</fileset>
77+
</path>
78+
79+
<!-- "classpath" is the id of a <path>, not a property, so ${classpath}
     would be echoed literally. Expose the path as a property first. -->
<property name="classpath.string" refid="classpath"/>
<echo message="classpath:${classpath.string}" />
80+
<!-- ====================================================== -->
81+
<!-- Stuff needed by all targets -->
82+
<!-- ====================================================== -->
83+
<echo message="Processing Init Target"/>
84+
<target name="init">
85+
<echo message="Executing Init Target"/>
86+
87+
<mkdir dir="${build.dir}"/>
88+
<mkdir dir="${build.classes}"/>
89+
<mkdir dir="${build.src}"/>
90+
<mkdir dir="${build.anttasks}"/>
91+
92+
</target>
93+
94+
<!-- ====================================================== -->
95+
<!-- Compile the Java files -->
96+
<!-- ====================================================== -->
97+
<echo message="Processing Compile Core Classes Target"/>
98+
<target name="compile-core-classes" depends="init">
99+
100+
<!-- Compile Amazon Java files -->
101+
<javac
102+
encoding="${build.encoding}"
103+
srcdir="${amazon.src.dir}"
104+
includes="com/amazon/**/*.java"
105+
destdir="${build.classes}"
106+
debug="${javac.debug}"
107+
optimize="${javac.optimize}"
108+
target="${javac.version}"
109+
source="${javac.version}"
110+
deprecation="${javac.deprecation}" >
111+
<compilerarg line="${javac.args} ${javac.args.warnings}" />
112+
<classpath refid="classpath"/>
113+
</javac>
114+
115+
<!-- Compile Java files (excluding JSPs) checking warnings -->
116+
<javac
117+
encoding="${build.encoding}"
118+
srcdir="${src.dir};${build.src}"
119+
includes="org/commoncrawl/**/*.java"
120+
excludes="org/commoncrawl/**/OneService.java"
121+
destdir="${build.classes}"
122+
debug="${javac.debug}"
123+
optimize="${javac.optimize}"
124+
target="${javac.version}"
125+
source="${javac.version}"
126+
deprecation="${javac.deprecation}" >
127+
<compilerarg line="${javac.args} ${javac.args.warnings}" />
128+
<classpath refid="classpath"/>
129+
</javac>
130+
131+
<copy todir="${build.classes}">
132+
<fileset
133+
dir="${src.dir}"
134+
includes="**/*.properties"
135+
/>
136+
</copy>
137+
</target>
138+
139+
<echo message="Processing Compile Core Target"/>
140+
<target name="compile-core" depends="compile-core-classes">
141+
</target>
142+
143+
<echo message="Processing Compile Target"/>
144+
<target name="compile" depends="compile-core,jar">
145+
</target>
146+
147+
<!-- ================================================================== -->
148+
<!-- Make commoncrawl.jar -->
149+
<!-- ================================================================== -->
150+
<!-- -->
151+
<!-- ================================================================== -->
152+
<echo message="Processing Jar Target"/>
153+
<target name="jar" depends="compile-core">
154+
<jar jarfile="${build.dir}/${final.name}.jar" basedir="${build.classes}" duplicate="preserve" >
155+
156+
<manifest>
157+
<section name="org/commoncrawl">
158+
<attribute name="Implementation-Title" value="commoncrawl"/>
159+
<attribute name="Implementation-Version" value="${version}"/>
160+
<attribute name="Implementation-Vendor" value="CommonCrawl"/>
161+
</section>
162+
</manifest>
163+
<zipfileset dir="${conf.dir}" prefix="conf" >
164+
<include name="*" />
165+
</zipfileset>
166+
<zipfileset dir="${lib.dir}" prefix="lib" >
167+
<include name="*.jar" />
168+
</zipfileset>
169+
<zipfileset dir="${hadoop.path}/lib" prefix="lib" >
170+
<include name="**/*.jar" />
171+
</zipfileset>
172+
</jar>
173+
</target>
174+
175+
<!-- ================================================================== -->
176+
<!-- Clean. Delete the build files, and their directories -->
177+
<!-- ================================================================== -->
178+
179+
<echo message="Processing Clean Target"/>
180+
<target name="clean">
181+
<delete dir="${build.dir}"/>
182+
<delete>
183+
<fileset dir="${src.dir}" includes="**\/gen.stamp" />
184+
</delete>
185+
</target>
186+
<echo message="Done Processing Targets"/>
187+
</project>

conf/commons-logging.properties

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
#Logging Implementation
2+
3+
#Log4J
4+
org.apache.commons.logging.Log=org.apache.commons.logging.impl.Log4JLogger
5+
6+
#JDK Logger
7+
#org.apache.commons.logging.Log=org.apache.commons.logging.impl.Jdk14Logger

0 commit comments

Comments
 (0)