To troubleshoot Calico network issues within guest clusters, debug logging is indispensable. In this post, let's look at the steps required to enable debug logging for Calico.

We first need to scale down the tkg-controller-manager on the supervisor cluster so that the changes we make to the Calico pods are not reset.

STEP 1: Scale down the tkg-controller-manager

  1. From the vCenter Server, log in to the control plane VM. The password and IP details can be found using:
/usr/lib/vmware-wcp/decryptK8Pwd.py
Read key from file

Connected to PSQL

Cluster: domain-c8:dad7e875-3357-449b-809a-bf1783e3430d
IP: 172.16.0.201
PWD: zflRFNeKdd7F7RJJ1sawcmO+WsnQPRgfzkIfU9+v2ZXHdQ6g6mORIKsOVyyQ6AqK057q/9YzlvGS27nMUsS05T1IFflP
------------------------------------------------------------
  1. Log in to the control plane VM
root@vcenter [ ~ ]# ssh 172.16.0.201
FIPS mode initialized
Password: 
  1. Scale down the vmware-system-tkg-controller-manager
kubectl -n vmware-system-tkg scale deploy/vmware-system-tkg-controller-manager --replicas=0
deployment.apps/vmware-system-tkg-controller-manager scaled

STEP 2: Log in to the guest cluster

  1. For example, the guest cluster here is demo-calico-tkc and the supervisor cluster namespace is gs-dev:
 kubectl vsphere login -u administrator@vsphere.local --server=kube.gs.labs --insecure-skip-tls-verify --tanzu-kubernetes-cluster-name demo-calico-tkc --tanzu-kubernetes-cluster-namespace gs-dev
  1. Switch to the guest cluster context
root@debian:~# kubectl config use-context demo-calico-tkc
Switched to context "demo-calico-tkc".

STEP 3: Enable debug logging on the Calico Controller

  1. Edit the deployment `calico-kube-controllers`
root@debian:~# kubectl get deploy -A
NAMESPACE                      NAME                           READY   UP-TO-DATE   AVAILABLE   AGE
kube-system                    calico-kube-controllers        1/1     1            1           4h36m
kube-system                    coredns                        2/2     2            2           4h37m
vmware-system-cloud-provider   guest-cluster-cloud-provider   1/1     1            1           4h36m
vmware-system-csi              vsphere-csi-controller         1/1     1            1           4h36m

root@debian:~# kubectl edit deployment calico-kube-controllers -n kube-system
  1. Add LOG_LEVEL=debug as an environment variable under the containers: spec.
    spec:
      containers:
      - env:
        - name: ENABLED_CONTROLLERS
          value: node
        - name: DATASTORE_TYPE
          value: kubernetes
        - name: LOG_LEVEL
          value: debug
        image: localhost:5000/vmware.io/calico/kube-controllers:v3.11.2_vmware.1
  1. A new calico-kube-controllers pod will be deployed. If not, delete the existing calico-kube-controllers pod.
root@debian:~# kubectl get pods -A
NAMESPACE                      NAME                                                             READY   STATUS    RESTARTS   AGE
kube-system                    calico-kube-controllers-65775d7cc6-r2jnm                         1/1     Running   0          2m21s
kube-system                    calico-node-gl229                                                1/1     Running   0          89m
kube-system                    calico-node-ld8fd                                                1/1     Running   0          83m
kube-system                    calico-node-pkfhn                                                1/1     Running   0          4h44m
  1. View Calico controller debug logs
root@debian:~# kubectl logs calico-kube-controllers-65775d7cc6-r2jnm -n kube-system
...
2021-09-21 05:43:43.866 [DEBUG][1] k8s.go 524: Performing 'Get' for ClusterInformation(default) 
2021-09-21 05:43:43.866 [DEBUG][1] customresource.go 208: Get custom Kubernetes resource Key=ClusterInformation(default) Resource="ClusterInformations" Revision=""
2021-09-21 05:43:43.866 [DEBUG][1] customresource.go 219: Get custom Kubernetes resource by name Key=ClusterInformation(default) Name="default" Namespace="" Resource="ClusterInformations" Revision=""
2021-09-21 05:43:43.869 [DEBUG][1] client.go 276: Cluster GUID value already set ClusterGUID="cbbe8dcd369a4543bcb77edf6b9a15e7"
2021-09-21 05:43:43.869 [DEBUG][1] client.go 286: DatastoreReady value already set DatastoreReady=(*bool)(0xc0003913e6)

STEP 4: Enable debug logging for the calico pods

  1. Edit the calico-node daemonset
root@debian:~# kubectl get ds -A
NAMESPACE            NAME                     DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR                     AGE
kube-system          calico-node              3         3         3       1            3           beta.kubernetes.io/os=linux       4h50m
kube-system          kube-proxy               3         3         3       3            3           kubernetes.io/os=linux            4h51m
vmware-system-auth   guest-cluster-auth-svc   1         1         1       1            1           node-role.kubernetes.io/master=   4h51m
vmware-system-csi    vsphere-csi-node         3         3         3       3            3           <none>                            4h50m

root@debian:~# kubectl edit ds calico-node -n kube-system

  1. Add FELIX_LOGSEVERITYSCREEN as an environment variable under the containers: spec.
spec:
      containers:
      - env:
        - name: DATASTORE_TYPE
          value: kubernetes
        - name: WAIT_FOR_DATASTORE
          value: "true"
        - name: NODENAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: spec.nodeName
        - name: CALICO_NETWORKING_BACKEND
          valueFrom:
            configMapKeyRef:
              key: calico_backend
              name: calico-config
        - name: CLUSTER_TYPE
          value: k8s,bgp
        - name: IP
          value: autodetect
        - name: CALICO_IPV4POOL_IPIP
          value: Always
        - name: FELIX_IPINIPMTU
          valueFrom:
            configMapKeyRef:
              key: veth_mtu
              name: calico-config
        - name: CALICO_IPV4POOL_CIDR
          value: 192.168.200.0/24
        - name: CALICO_DISABLE_FILE_LOGGING
          value: "true"
        - name: FELIX_DEFAULTENDPOINTTOHOSTACTION
          value: ACCEPT
        - name: FELIX_IPV6SUPPORT
          value: "false"
        - name: FELIX_LOGSEVERITYSCREEN
          value: Debug
  1. Delete the existing calico-node pods
root@debian:~# kubectl get pods -A
NAMESPACE                      NAME                                                             READY   STATUS    RESTARTS   AGE
kube-system                    calico-kube-controllers-65775d7cc6-r2jnm                         1/1     Running   0          13m
kube-system                    calico-node-gl229                                                1/1     Running   0          100m
kube-system                    calico-node-ld8fd                                                1/1     Running   0          95m
kube-system                    calico-node-pkfhn                                                1/1     Running   0          4h56m

root@debian:~# kubectl delete pod calico-node-ld8fd -n kube-system
  1. To view the debug logs for each node:
kubectl logs calico-node-gl229 -n kube-system
...
2021-09-21 05:55:33.297 [DEBUG][41] health.go 131: New health report lastReport=health.HealthReport{Live:true, Ready:true} name="async_calc_graph" newReport=&health.HealthReport{Live:true, Ready:true}
2021-09-21 05:55:33.668 [DEBUG][41] ipip_mgr.go 103: Configuring IPIP tunnel mtu=1440 tunnelAddr=192.168.200.64
2021-09-21 05:55:33.668 [DEBUG][41] ipip_mgr.go 155: Setting local IPv4 address on link. addr=192.168.200.64 link="tunl0"
2021-09-21 05:55:33.668 [DEBUG][41] ipip_mgr.go 171: Address already present. addr=192.168.200.64 link="tunl0"
2021-09-21 05:55:33.668 [DEBUG][41] ipip_mgr.go 197: Address set. addr=192.168.200.64 link="tunl0"

Revert Changes: To revert the changes, scale the tkg-controller-manager back to 3 replicas on the control plane VM.

kubectl -n vmware-system-tkg scale deploy/vmware-system-tkg-controller-manager --replicas=3
deployment.apps/vmware-system-tkg-controller-manager scaled