generateData.sh
Uploaded by: quxuerui
Upload date: 2018-01-08
Resource size: 41811k
File size: 3k
Source category: Grid computing
Development platform: Java
#!/usr/bin/env bash
GRID_DIR=`dirname "$0"`
GRID_DIR=`cd "$GRID_DIR"; pwd`
source "$GRID_DIR/gridmix-env"
# Smaller data set is used by default.
COMPRESSED_DATA_BYTES=2147483648
UNCOMPRESSED_DATA_BYTES=536870912
INDIRECT_DATA_BYTES=58720256
# Number of partitions for output data
if [ -z "${NUM_MAPS}" ] ; then
  NUM_MAPS=100
fi
INDIRECT_DATA_FILES=200
# If the env var USE_REAL_DATASET is set, then use the params to generate the bigger (real) dataset.
if [ ! -z "${USE_REAL_DATASET}" ] ; then
  echo "Using real dataset"
  # 2TB data compressing to approx 500GB
  COMPRESSED_DATA_BYTES=2147483648000
  # 500GB
  UNCOMPRESSED_DATA_BYTES=536870912000
  # Default approx 70MB per data file, compressed
  INDIRECT_DATA_BYTES=58720256000
fi
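# Variable-length keys (5-10 words) and values (100-10000 words), block-compressed SequenceFile output into ${VARCOMPSEQ}.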
${HADOOP_HOME}/bin/hadoop jar \
  ${EXAMPLE_JAR} randomtextwriter \
  -D test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES} \
  -D test.randomtextwrite.bytes_per_map=$((${COMPRESSED_DATA_BYTES} / ${NUM_MAPS})) \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=10000 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  ${VARCOMPSEQ} &
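# Fixed-length keys (5 words) and values (100 words), block-compressed SequenceFile output into ${FIXCOMPSEQ}.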
${HADOOP_HOME}/bin/hadoop jar \
  ${EXAMPLE_JAR} randomtextwriter \
  -D test.randomtextwrite.total_bytes=${COMPRESSED_DATA_BYTES} \
  -D test.randomtextwrite.bytes_per_map=$((${COMPRESSED_DATA_BYTES} / ${NUM_MAPS})) \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=100 \
  -D test.randomtextwrite.max_words_value=100 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.SequenceFileOutputFormat \
  ${FIXCOMPSEQ} &
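# Variable-length keys (1-10 words) and values (0-200 words), uncompressed plain-text output into ${VARINFLTEXT}.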
${HADOOP_HOME}/bin/hadoop jar \
  ${EXAMPLE_JAR} randomtextwriter \
  -D test.randomtextwrite.total_bytes=${UNCOMPRESSED_DATA_BYTES} \
  -D test.randomtextwrite.bytes_per_map=$((${UNCOMPRESSED_DATA_BYTES} / ${NUM_MAPS})) \
  -D test.randomtextwrite.min_words_key=1 \
  -D test.randomtextwrite.max_words_key=10 \
  -D test.randomtextwrite.min_words_value=0 \
  -D test.randomtextwrite.max_words_value=200 \
  -D mapred.output.compress=false \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  ${VARINFLTEXT} &
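# Small "indirect" data set: fixed-length keys (5 words) and values (20 words), compressed text output split across ${INDIRECT_DATA_FILES} files into ${FIXCOMPTEXT}.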
${HADOOP_HOME}/bin/hadoop jar \
  ${EXAMPLE_JAR} randomtextwriter \
  -D test.randomtextwrite.total_bytes=${INDIRECT_DATA_BYTES} \
  -D test.randomtextwrite.bytes_per_map=$((${INDIRECT_DATA_BYTES} / ${INDIRECT_DATA_FILES})) \
  -D test.randomtextwrite.min_words_key=5 \
  -D test.randomtextwrite.max_words_key=5 \
  -D test.randomtextwrite.min_words_value=20 \
  -D test.randomtextwrite.max_words_value=20 \
  -D mapred.output.compress=true \
  -D mapred.map.output.compression.type=BLOCK \
  -outFormat org.apache.hadoop.mapred.TextOutputFormat \
  ${FIXCOMPTEXT} &
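
Usage (a minimal sketch; gridmix-env is expected to define HADOOP_HOME, EXAMPLE_JAR and the output paths VARCOMPSEQ, FIXCOMPSEQ, VARINFLTEXT and FIXCOMPTEXT, and the override values shown here are purely illustrative):

  # Generate the default (smaller) data set with 100 output partitions:
  ./generateData.sh
  # Override the partition count, or switch to the ~2TB/500GB "real" data set:
  NUM_MAPS=200 ./generateData.sh
  USE_REAL_DATASET=1 ./generateData.sh

Note that the four randomtextwriter jobs are launched in the background (&) and the script does not wait for them, so they keep running after generateData.sh itself returns.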