dmwm · mrceyhun · Sep 7, 2022 · Sep 11, 2022 · Oct 4, 2022 · vkuznet
diff --git a/bin/cron4wma_crab_ds_access.sh b/bin/cron4wma_crab_ds_access.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+set -e
+# Author: Ceyhun Uzunoglu <ceyhunuzngl AT gmail [DOT] com>
+##H Cron job of wmarchive_crab_file_access.py which sends Spark agg results to MONIT via StompAMQ
+##H See PySpark job for detailed explanation.
+##H
+##H Usage:
+##H    cron4wma_crab_ds_access.sh \
+##H        <keytab> value <amq> value <cmsmonitoring> value <stomp> value
+##H
+##H Example :
+##H    cron4wma_crab_ds_access.sh \
+##H        --keytab ./keytab --amq ./amq-creds.json --cmsmonitoring ./CMSMonitoring.zip --stomp ./stomp-v700.zip \
+##H        --p1 32000 --p2 32001 --host $MY_NODE_NAME --wdir $WDIR
+##H Arguments with values:
+##H   - keytab             : Kerberos auth file: secrets/kerberos
+##H   - amq                : AMQ credentials and configurations json file for the used AMQ topic
+##H   - cmsmonitoring      : dmwm/CMSMonitoring/src/python/CMSMonitoring folder as zip to be sent to Spark nodes
+##H   - stomp              : stomp.py==7.0.0 module as zip to be sent to Spark nodes which have lower versions.
+##H   - p1, p2, host, wdir : [ALL FOR K8S] p1 and p2 spark required ports(driver and blockManager), host is k8s node dns alias, wdir is working directory
+##H   - test               : flag without value; runs only the test job, which sends only 10 documents to the AMQ topic. Please give test/training AMQ credentials
+##H
+# Export TZ so that child processes also run in UTC (e.g. the `date` calls used for the argument log line and
+# for the dated LOG_DIR). A plain `TZ=UTC` assignment reaches them only if TZ is already in the environment.
+export TZ=UTC
+# Epoch seconds at start; used at the very end to report the total run time.
+START_TIME=$(date +%s)
+# Absolute, symlink-resolved directory that contains this script.
+# `&&` makes a failed `cd` fail the whole substitution (and, via `set -e`, the script) instead of silently
+# reporting the caller's current directory as the script directory.
+script_dir="$(
+    cd -- "$(dirname -- "$0")" >/dev/null 2>&1 && pwd -P
+)"
+# get common util functions
+. "$script_dir"/utils/common_utils.sh
+
+# Error hook: emits a final error log line when a command fails at the top level of the script.
+# NOTE: this script only enables `set -e`; bash does not pass an ERR trap on to shell functions, command
+# substitutions or subshells unless errtrace (`set -E`) is also enabled.
+onFailExit() {
+    if ! util4loge "finished with error!"; then
+        # The logger itself failed: still leave with a failure status.
+        exit 1
+    fi
+}
+trap onFailExit ERR
+# ------------------------------------------------------------------------------------------------------- GET USER ARGS
+# Start from a clean slate so values inherited from the environment cannot leak into this run.
+unset -v KEYTAB_SECRET AMQ_JSON_CREDS CMSMONITORING_ZIP STOMP_ZIP PORT1 PORT2 K8SHOST WDIR IS_TEST help
+
+# No arguments at all: print the help text. `util_usage_help` is the same helper used for --help further down
+# (the previously called `usage` is not defined in this script).
+if [[ "$#" -eq 0 ]]; then
+    util_usage_help
+    exit 1 # only reached if the helper returns instead of exiting
+fi
+
+# --options (short options) is mandatory, and v is a dummy param.
+# getopt is tested directly in the `if`: under `set -e` a failing `VAR=$(getopt ...)` assignment aborts the script
+# before a separate `$?` check can run, so the help text would never be shown for invalid options.
+# The output is left in getopt's default quoted form (no --unquoted) so that the `eval set --` below keeps values
+# containing spaces or shell metacharacters intact instead of re-splitting or evaluating them.
+if ! PARSED_ARGS=$(getopt --options v,h --name "$(basename -- "$0")" \
+    --longoptions keytab:,amq:,cmsmonitoring:,stomp:,p1:,p2:,host:,wdir:,test,help -- "$@"); then
+    util_usage_help
+    exit 1 # only reached if the helper returns instead of exiting
+fi
+
+echo "$(date --rfc-3339=seconds)" "[INFO] Given arguments: $PARSED_ARGS"
+# Replace the positional parameters with getopt's normalized, shell-quoted argument list.
+eval set -- "$PARSED_ARGS"
+
+# Consume the normalized options; the loop stops at getopt's terminating `--` (or any unknown word).
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+    --keytab)        KEYTAB_SECRET=$2     ; shift 2 ;;
+    --amq)           AMQ_JSON_CREDS=$2    ; shift 2 ;;
+    --cmsmonitoring) CMSMONITORING_ZIP=$2 ; shift 2 ;;
+    --stomp)         STOMP_ZIP=$2         ; shift 2 ;;
+    --p1)            PORT1=$2             ; shift 2 ;;
+    --p2)            PORT2=$2             ; shift 2 ;;
+    --host)          K8SHOST=$2           ; shift 2 ;;
+    --wdir)          WDIR=$2              ; shift 2 ;;
+    --test)          IS_TEST=1            ; shift   ;;
+    -h | --help)     help=1               ; shift   ;;
+    *) break ;;
+    esac
+done
+
+if [[ "$help" == 1 ]]; then
+    util_usage_help
+    exit 0 # only reached if the helper returns instead of exiting
+fi
+# ------------------------------------------------------------------------------------------------------------- PREPARE
+# Make the repository's python sources importable for the checks and the Spark driver.
+export PYTHONPATH="$script_dir/../src/python:$PYTHONPATH"
+# run util to check PYTHONPATH
+util_check_pythonpath_for_cmsspark
+
+#  check files exist
+util_check_files "$KEYTAB_SECRET" "$AMQ_JSON_CREDS" "$CMSMONITORING_ZIP" "$STOMP_ZIP"
+# check variables set
+util_check_vars PORT1 PORT2 K8SHOST WDIR
+
+# Define logs path for Spark imports which produce lots of info logs.
+# This is done only after the WDIR check above: with an unset WDIR the path would collapse to "/logs/<date>"
+# and mkdir would try to create it at the filesystem root.
+LOG_DIR="$WDIR/logs/$(date +%Y%m%d)"
+mkdir -p -- "$LOG_DIR"
+
+# INITIALIZE ANALYTIX SPARK3
+util_setup_spark_k8s
+
+# Authenticate with Kerberos and get the principal user name
+KERBEROS_USER=$(util_kerberos_auth_with_keytab "$KEYTAB_SECRET")
+util4logi "authenticated with ${KERBEROS_USER} user's keytab"
+
+# INFO LOGS
+util4logi "used variables ..."
+util4logi "KERBEROS_USER: ${KERBEROS_USER} , authenticated with user's keytab"
+util4logi "LOG_DIR: ${LOG_DIR}"
+
+# ------------------------------------------------------------------------------------------------------- RUN SPARK JOB
+util4logi "spark job starts"
+
+# spark-submit options, assembled group by group (element order is significant for paired flags).
+spark_submit_args=(--master yarn)
+spark_submit_args+=(--conf spark.ui.showConsoleProgress=false)
+# Required for Spark job in K8s: bind address, driver host and the two driver ports.
+spark_submit_args+=(--conf "spark.driver.bindAddress=0.0.0.0")
+# Executor memory 16g is required because this spark job do heavy joins on 10 tables, half of them are big ones
+spark_submit_args+=(--driver-memory=8g --executor-memory=16g)
+spark_submit_args+=(--conf "spark.driver.host=${K8SHOST}")
+spark_submit_args+=(--conf "spark.driver.port=${PORT1}")
+spark_submit_args+=(--conf "spark.driver.blockManager.port=${PORT2}")
+spark_submit_args+=(--packages org.apache.spark:spark-avro_2.12:3.2.1)
+spark_submit_args+=(--py-files "${CMSMONITORING_ZIP},${STOMP_ZIP}")
+
+# Arguments handed to the PySpark job itself.
+py_input_args=(--creds "$AMQ_JSON_CREDS")
+py_input_args+=(--amq_batch_size 1000)
+
+#######################################
+# Submit the wmarchive_crab_file_access.py PySpark job.
+# Globals:   spark_submit_args, py_input_args, script_dir, LOG_DIR (all read only)
+# Arguments: optional extra job arguments, appended after py_input_args
+# Outputs:   stdout and stderr of spark-submit are appended to ${LOG_DIR}/spark-wma_crab_ds_access.log
+# Returns:   exit status of spark-submit
+#######################################
+function run_spark() {
+    spark-submit "${spark_submit_args[@]}" "${script_dir}/../src/python/CMSSpark/wmarchive_crab_file_access.py" \
+        "${py_input_args[@]}" "$@" >>"${LOG_DIR}/spark-wma_crab_ds_access.log" 2>&1
+}
+
+#######################################
+# Submit the same job in test mode with a fixed date range.
+# Delegates to run_spark so there is a single spark-submit invocation to maintain, and passes the test
+# arguments as parameters instead of appending them to the global py_input_args.
+# Returns:   exit status of spark-submit
+#######################################
+function run_test_spark() {
+    # Test will send 10 documents to AMQ topic
+    run_spark --test --start_date 2022-09-01 --end_date 2022-09-03
+}
+
+# RUN SPARK
+# The exit status is captured with `|| spark_rc=$?` instead of letting `set -e` end the script immediately, so that
+# the tail of the job log is also printed for failed runs, which is when it is needed most.
+# (run_spark/run_test_spark already redirect stdout and stderr into the log file themselves.)
+spark_rc=0
+if [[ "$IS_TEST" == 1 ]]; then
+    # will send only 10 documents, only to test/training AMQ topic. Please check python script for more details.
+    run_test_spark || spark_rc=$?
+else
+    run_spark || spark_rc=$?
+fi
+
+util4logi "last 10 lines of spark job log"
+tail -n 10 "${LOG_DIR}/spark-wma_crab_ds_access.log"
+
+if [[ "$spark_rc" -ne 0 ]]; then
+    # Emit the same error log line as the ERR trap, then propagate spark-submit's exit status.
+    onFailExit
+    exit "$spark_rc"
+fi
+
+duration=$(($(date +%s) - START_TIME))
+util4logi "all finished, time spent: $(util_secs_to_human "$duration")"
diff --git a/doc/pyspark_shell.md b/doc/pyspark_shell.md
@@ -1,8 +1,10 @@
 ## How to run PySpark shell for tests in Kubernetes pods or VMs
 
-If SWAN.cern.ch is not working, you can use PySpark to run your PySpark code. It gives nice IPython shell depending on your python environment.
+If SWAN.cern.ch is not working, you can use the PySpark shell to run your PySpark code. It gives a nice IPython shell,
+depending on your python environment.
 
 - Kerberos authentication:
+
 ```
 kinit $USER@CERN.CH
 ```
@@ -11,28 +13,35 @@ kinit $USER@CERN.CH
 
 - You need to be in LxPlus7
 - If you use additional Python repositories, please make sure that they are in `PYTHONPATH`
-- `--py-files` is optional, just to put there to show how you can add
+- `--py-files` is optional; it is included here only to show how to add extra Python files
+
+> Attention: do not use `LCG102` for now; it produces an `ImportError: libffi.so.8` error in LxPlus7.
+>
+> For that reason, you need to provide the Avro package in version `3.1.2`
+> (`org.apache.spark:spark-avro_2.12:3.1.2`); in LCG102 the matching version would be `3.2.1`.
+>
+> In any case, please set the Avro version according to `spark-submit --version`.
 
 ###### Run in LxPlus7
+
 ```
 # Setup Analytix connection
 
 source /cvmfs/sft.cern.ch/lcg/views/LCG_101/x86_64-centos7-gcc8-opt/setup.sh
 source /cvmfs/sft.cern.ch/lcg/etc/hadoop-confext/hadoop-swan-setconf.sh analytix 3.2 spark3
 export PATH="${PATH}:/usr/hdp/hadoop/bin/hadoop:/usr/hdp/spark3/bin:/usr/hdp/sqoop/bin"
+export PYSPARK_DRIVER_PYTHON=ipython
+# Set ipython as driver python
 
 # Required Spark confs
 spark_submit_args=(
   --master yarn 
   --conf spark.ui.showConsoleProgress=false 
   --driver-memory=8g --executor-memory=8g
-  --packages org.apache.spark:spark-avro_2.12:3.2.1 
+  --packages org.apache.spark:spark-avro_2.12:3.1.2 
   --py-files "/data/CMSMonitoring.zip,/data/stomp-v700.zip"
 )
 
-# Set ipython as driver python
-export PYSPARK_DRIVER_PYTHON=ipython
-
 # Run
 pyspark ${spark_submit_args[@]}
 
@@ -49,11 +58,14 @@ pyspark ${spark_submit_args[@]}
 - You need to define :`spark.driver.bindAddress, spark.driver.host, spark.driver.port, spark.driver.blockManager.port`
 - Kubernetes ports should be open in both directions (in/out), e.g. as a NodePort
 - If you use additional Python repositories, please make sure that they are in `PYTHONPATH`
-- `--py-files` is optional, just to put there to show how you can add
+- `--py-files` is optional; it is included here only to show how to add extra Python files
 
 ###### Run in Kubernetes Pod
 
 ```
+# Set ipython as driver python
+export PYSPARK_DRIVER_PYTHON=ipython
+
 # Required Spark confs
 spark_submit_args=(
   --master yarn 
@@ -67,9 +79,6 @@ spark_submit_args=(
   --py-files "/data/CMSMonitoring.zip,/data/stomp-v700.zip"
 )
 
-# Set ipython as driver python
-export PYSPARK_DRIVER_PYTHON=ipython
-
 # Run
 pyspark ${spark_submit_args[@]}