Thursday, 3 October 2019

Clusterware Startup Issue and Solution


Clusterware startup issue = node startup issue


1) verify clusterware status

[root@node1 ~]# /u02/oracle/12.1.0.2/grid/bin/crsctl stat res -t
CRS-4535: Cannot communicate with Cluster Ready Services
CRS-4000: Command Status failed, or completed with errors.
[root@node1 ~]#

[root@node1 ~]# /u02/oracle/12.1.0.2/grid/bin/crsctl stat res -t -init

[root@node1 ~]# ps -ef|grep d.bin
root 5484 1 29 05:24 ? 00:55:50 /u02/oracle/12.1.0.2/grid/bin/ohasd.bin reboot
root 6020 1 0 05:24 ? 00:00:34 /u02/oracle/12.1.0.2/grid/bin/orarootagent.bin
oracrs 6116 1 0 05:24 ? 00:00:33 /u02/oracle/12.1.0.2/grid/bin/oraagent.bin
oracrs 6128 1 0 05:24 ? 00:00:29 /u02/oracle/12.1.0.2/grid/bin/mdnsd.bin
oracrs 6132 1 2 05:24 ? 00:04:58 /u02/oracle/12.1.0.2/grid/bin/evmd.bin
oracrs 6163 1 0 05:24 ? 00:00:31 /u02/oracle/12.1.0.2/grid/bin/gpnpd.bin
oracrs 6180 6132 1 05:24 ? 00:03:33 /u02/oracle/12.1.0.2/grid/bin/evmlogger.bin -o /u02/oracle/12.1.0.2/grid/log/[HOSTNAME]/evmd/evmlogger.info -l /u02/oracle/12.1.0.2/grid/log/[HOSTNAME]/evmd/evmlogger.log
oracrs 6255 1 47 05:24 ? 01:27:47 /u02/oracle/12.1.0.2/grid/bin/gipcd.bin
root 20116 1 0 05:27 ? 00:00:39 /u02/oracle/12.1.0.2/grid/bin/cssdmonitor
root 116579 105692 0 08:31 pts/1 00:00:00 grep d.bin


1. Check the complete ASM alert log (do not truncate it).
2. Verify the generated HTML by executing script1 from MOS Note Doc ID 470211.1.
3. ls -lL /dev/oracleasm/disks/*
4. $ kfod disks=all          (execute this as the Grid owner; see the example after this list)
5. # ls -l <GridHome>/bin/oracle
6. # ls -l <RDBMSHome>/bin/oracle
7. # crsctl stat res -t -init
8. # ps -ef|grep d.bin
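
As a hedged illustration of checks 4-6 above (the grid home path and grid owner 'oracrs' are taken from this environment; the RDBMS home path is illustrative):

# as the Grid owner, list all disks visible to ASM
su - oracrs
export ORACLE_HOME=/u02/oracle/12.1.0.2/grid
$ORACLE_HOME/bin/kfod disks=all

# as root, check the oracle binary permissions in the Grid and RDBMS homes
ls -l /u02/oracle/12.1.0.2/grid/bin/oracle
ls -l /u02/oracle/db/12.1.0.2/bin/oracle      # RDBMS home path is illustrative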

[root@node1 tmp]# ls -lL /dev/oracleasm/disks/*
brw-rw---- 1 oracrs asmadmin 202, 65 Oct 1 05:24 /dev/oracleasm/disks/ACFS_DATA1
brw-rw---- 1 oracrs asmadmin 202, 81 Oct 1 05:24 /dev/oracleasm/disks/ACFS_DATA2
brw-rw---- 1 oracrs asmadmin 202, 97 Oct 1 05:24 /dev/oracleasm/disks/ACFS_DATA3
brw-rw---- 1 oracrs asmadmin 202, 113 Oct 1 05:24 /dev/oracleasm/disks/OCR_VOTE01

2. Verify the generated HTML by executing script1 from MOS Note Doc ID 470211.1.
The detailed commands are:


SPOOL ASM<#>_GENERIC_ASM_METADATA.html
-- ASM VERSIONS 10.1, 10.2, 11.1, 11.2, 12.1 & 12.2
SET MARKUP HTML ON
SET ECHO ON

SET PAGESIZE 200

ALTER SESSION SET NLS_DATE_FORMAT='DD-MON-YYYY HH24:MI:SS';

SELECT 'THIS ASM REPORT WAS GENERATED AT: ==)> ' , SYSDATE " " FROM DUAL;
SELECT 'INSTANCE NAME: ==)> ' , INSTANCE_NAME " " FROM V$INSTANCE;

SELECT 'HOSTNAME ASSOCIATED WITH THIS ASM INSTANCE: ==)> ' , MACHINE " " FROM V$SESSION WHERE PROGRAM LIKE '%SMON%';

SELECT * FROM V$INSTANCE;

SELECT * FROM GV$INSTANCE;

SELECT * FROM V$ASM_DISKGROUP;

SELECT GROUP_NUMBER, DISK_NUMBER, MOUNT_STATUS, HEADER_STATUS, MODE_STATUS, STATE, OS_MB, TOTAL_MB, FREE_MB, NAME, FAILGROUP, PATH
FROM V$ASM_DISK ORDER BY GROUP_NUMBER, FAILGROUP, DISK_NUMBER;

SELECT * FROM V$ASM_DISK ORDER BY GROUP_NUMBER,DISK_NUMBER;

SELECT SUBSTR(D.NAME,1,16) AS ASMDISK, D.MOUNT_STATUS, D.STATE,
DG.NAME AS DISKGROUP FROM V$ASM_DISKGROUP DG, V$ASM_DISK D
WHERE DG.GROUP_NUMBER = D.GROUP_NUMBER;

SELECT * FROM V$ASM_CLIENT;

SELECT DG.NAME AS DISKGROUP, SUBSTR(C.INSTANCE_NAME,1,12) AS INSTANCE,
SUBSTR(C.DB_NAME,1,12) AS DBNAME, SUBSTR(C.SOFTWARE_VERSION,1,12) AS SOFTWARE,
SUBSTR(C.COMPATIBLE_VERSION,1,12) AS COMPATIBLE
FROM V$ASM_DISKGROUP DG, V$ASM_CLIENT C
WHERE DG.GROUP_NUMBER = C.GROUP_NUMBER;

SELECT * FROM V$ASM_ATTRIBUTE;

SELECT * FROM V$ASM_OPERATION;
SELECT * FROM GV$ASM_OPERATION;

SELECT * FROM V$VERSION;

SELECT * FROM V$ASM_ACFSSNAPSHOTS;
SELECT * FROM V$ASM_ACFSVOLUMES;
SELECT * FROM V$ASM_FILESYSTEM;
SELECT * FROM V$ASM_VOLUME;
SELECT * FROM V$ASM_VOLUME_STAT;

SELECT * FROM V$ASM_USER;
SELECT * FROM V$ASM_USERGROUP;
SELECT * FROM V$ASM_USERGROUP_MEMBER;

SELECT * FROM V$ASM_DISK_IOSTAT;
SELECT * FROM V$ASM_DISK_STAT;
SELECT * FROM V$ASM_DISKGROUP_STAT;

SELECT * FROM V$ASM_TEMPLATE;

SHOW PARAMETER

SHOW SGA

!echo "SELECT '" > /tmp/GPNPTOOL.SQL 2> /dev/null
! $ORACLE_HOME/bin/gpnptool get >> /tmp/GPNPTOOL.SQL 2>> /dev/null
!echo "' FROM DUAL;" >> /tmp/GPNPTOOL.SQL 2>> /dev/null
! cat /tmp/GPNPTOOL.SQL
SET ECHO OFF

--DISPLAYS INFORMATION ABOUT THE CONTENTS OF THE SPFILE.
SELECT * FROM V$SPPARAMETER ORDER BY 2;
SELECT * FROM GV$SPPARAMETER ORDER BY 3;

--DISPLAYS INFORMATION ABOUT THE INITIALIZATION PARAMETERS THAT ARE CURRENTLY IN EFFECT IN THE INSTANCE.
SELECT * FROM V$SYSTEM_PARAMETER ORDER BY 2;
SELECT * FROM GV$SYSTEM_PARAMETER ORDER BY 3;

-- 12C ACFS VIEWS

SELECT * FROM V$ASM_ACFS_ENCRYPTION_INFO;
SELECT * FROM V$ASM_ACFSREPL;
SELECT * FROM V$ASM_ACFSREPLTAG;
SELECT * FROM V$ASM_ACFS_SEC_ADMIN;
SELECT * FROM V$ASM_ACFS_SEC_CMDRULE;
SELECT * FROM V$ASM_ACFS_SEC_REALM;
SELECT * FROM V$ASM_ACFS_SEC_REALM_FILTER;
SELECT * FROM V$ASM_ACFS_SEC_REALM_GROUP;
SELECT * FROM V$ASM_ACFS_SEC_REALM_USER;
SELECT * FROM V$ASM_ACFS_SEC_RULE;
SELECT * FROM V$ASM_ACFS_SEC_RULESET;
SELECT * FROM V$ASM_ACFS_SEC_RULESET_RULE;
SELECT * FROM V$ASM_ACFS_SECURITY_INFO;
SELECT * FROM V$ASM_ACFSTAG;

-- 12C ASM AUDIT VIEWS

SELECT * FROM V$ASM_AUDIT_CLEAN_EVENTS;
SELECT * FROM V$ASM_AUDIT_CLEANUP_JOBS;
SELECT * FROM V$ASM_AUDIT_CONFIG_PARAMS;
SELECT * FROM V$ASM_AUDIT_LAST_ARCH_TS;

-- 12C ASM ESTIMATE VIEW

SELECT * FROM V$ASM_ESTIMATE;
SELECT * FROM GV$ASM_ESTIMATE;

-- SPARSE Diskgroups VIEW

SELECT * FROM V$ASM_DISK_SPARSE;
SELECT * FROM V$ASM_DISKGROUP_SPARSE;

SPOOL OFF

EXIT

spool asm<#>_alias+files.html
-- ASM VERSIONS 10.1, 10.2, 11.1, 11.2, 12.1 & 12.2
SET MARKUP HTML ON
set echo on

set pagesize 200

COLUMN BYTES FORMAT 9999999999999999

alter session set nls_date_format='DD-MON-YYYY HH24:MI:SS';

select 'THIS ASM REPORT WAS GENERATED AT: ==)> ' , sysdate " " from dual;


select 'HOSTNAME ASSOCIATED WITH THIS ASM INSTANCE: ==)> ' , MACHINE " " from v$session where program like '%SMON%';

select * from v$asm_alias;

select * from v$asm_file;

show parameter asm
show parameter cluster
show parameter instance_type
show parameter instance_name
show parameter spfile

show sga

spool off
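
A hedged example of how these two scripts might be run against the ASM instance (the SID '+ASM1' and the script file names are illustrative; save each script block above into its own .sql file first):

# as the Grid owner on the node hosting ASM
export ORACLE_HOME=/u02/oracle/12.1.0.2/grid
export ORACLE_SID=+ASM1
$ORACLE_HOME/bin/sqlplus / as sysasm @asm_generic_metadata.sql
$ORACLE_HOME/bin/sqlplus / as sysasm @asm_alias_files.sql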


One possible workaround is to delete the socket files.

Please execute the following action plan

1. Stop CRS on the issue node

crsctl stop crs

If it does not stop:

crsctl stop crs -f

Make sure the entire GI stack is down: 'ps -ef | grep d.bin' (kill any processes still found using the OS kill command; see the sketch below)

$GRID_ORACLE_HOME/bin/crsctl status resource -t
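
A hedged sketch for finding and killing any leftover GI daemons after 'crsctl stop crs -f' (review the PID list before killing anything):

ps -ef | grep d.bin | grep -v grep
# only after confirming the listed processes belong to the stopped GI stack:
ps -ef | grep d.bin | grep -v grep | awk '{print $2}' | xargs -r kill -9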


2. Remove the socket files on the issue node

Do not delete the .oracle folder itself; delete only the socket files inside it.

[root@node1 bin]# ls -ltra /var/tmp/.oracle

rm -rf /var/tmp/.oracle/*
rm -rf /tmp/.oracle/*

[root@node1 bin]# ls -ltra /var/tmp/.oracle


[root@node1 tmp]# ls -la
total 2344
drwxrwxrwt. 5 root root 4096 Oct 2 14:00 .
drwxr-xr-x. 22 root root 4096 Feb 27 2018 ..
-rw-r--r-- 1 fusadm dba 1976 Oct 1 03:45 apache_alert
-rw-r--r-- 1 fusadm dba 77 Oct 2 14:00 apache_alert.sh
-rw-r--r-- 1 fusadm dba 1976 Oct 1 03:45 apache_master
-rw-r--r-- 1 fusadm dba 2095832 Oct 1 00:39 cloud.txt
-rw-r--r-- 1 fusadm dba 3015 Sep 17 07:36 fil
-rw-r--r-- 1 fusadm dba 153 Sep 17 07:36 fil1
-rw-r--r-- 1 fusadm dba 153 Sep 17 07:36 fil1.srt
-rw------- 1 root root 1853 Jun 19 15:07 host_0
drwx------. 2 root root 16384 Oct 10 2017 lost+found
-rw-r--r-- 1 root root 157119 Aug 4 03:48 lpar2rrd-agent-10.160.36.212-root.err
-rw-r--r-- 1 root root 31647 Oct 2 14:00 lpar2rrd-agent-10.160.36.212-root-ps_job.txt
-rw-r--r-- 1 root root 0 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.stamp
-rw-r--r-- 1 root root 10 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.stamp-send
-rw-r--r-- 1 root root 0 Oct 2 07:57 lpar2rrd-agent-10.160.36.212-root.stamp-trimlogs
-rw-r--r-- 1 root root 4544 Oct 2 14:00 lpar2rrd-agent-10.160.36.212-root.txt
-rw-r--r-- 1 root root 4130 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.txtorig
-rw-r--r-- 1 root root 0 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.txt-tmp
-rw-r--r-- 1 root root 1761 Oct 2 14:00 lpar2rrd-agent.out
-rw-r--r-- 1 fusadm dba 2 Sep 13 11:39 mark
drwxrwxrwt 2 root dba 12288 Oct 2 01:43 oldoracle
drwxrwxrwt 2 root dba 4096 Oct 2 12:47 .oracle
-rw-r--r-- 1 fusadm dba 17 Sep 20 09:40 test_email
[root@node1 tmp]# cd .oracle
[root@node1 .oracle]# ls
npohasd ora_gipc_node1_DBG_OHASD ora_gipc_node1_DBG_OHASD_lock sOHASD_IPC_SOCKET_11 sOHASD_IPC_SOCKET_11_lock sOHASD_UI_SOCKET sOHASD_UI_SOCKET_lock sprocr_local_conn_0_PROL sprocr_local_conn_0_PROL_lock
[root@node1 .oracle]# ls -ltr
total 0
prw-r--r-- 1 root root 0 Oct 2 12:47 npohasd
-rw-r--r-- 1 root root 0 Oct 2 12:47 sprocr_local_conn_0_PROL_lock
srwxrwxrwx 1 root root 0 Oct 2 12:47 sprocr_local_conn_0_PROL
-rw-r--r-- 1 root root 0 Oct 2 12:47 ora_gipc_node1_DBG_OHASD_lock
srwxrwxrwx 1 root root 0 Oct 2 12:47 ora_gipc_node1_DBG_OHASD
-rw-r--r-- 1 root root 0 Oct 2 12:47 sOHASD_IPC_SOCKET_11_lock
srwxrwxrwx 1 root root 0 Oct 2 12:47 sOHASD_IPC_SOCKET_11
-rw-r--r-- 1 root root 0 Oct 2 12:47 sOHASD_UI_SOCKET_lock
srwxrwxrwx 1 root root 0 Oct 2 12:47 sOHASD_UI_SOCKET
[root@node1 .oracle]# rm *
rm: remove fifo `npohasd'? yes
rm: remove socket `ora_gipc_node1_DBG_OHASD'? yes
rm: remove regular empty file `ora_gipc_node1_DBG_OHASD_lock'? yes
rm: remove socket `sOHASD_IPC_SOCKET_11'? yes
rm: remove regular empty file `sOHASD_IPC_SOCKET_11_lock'? yes
rm: remove socket `sOHASD_UI_SOCKET'? yes
rm: remove regular empty file `sOHASD_UI_SOCKET_lock'? yes
rm: remove socket `sprocr_local_conn_0_PROL'? yes
rm: remove regular empty file `sprocr_local_conn_0_PROL_lock'? yes
[root@node1 .oracle]# ls
[root@node1 .oracle]# cat /oe
[root@node1 .oracle]# cat /etc/oratab
#Backup file is /u02/oracle/12.1.0.2/grid/srvm/admin/oratab.bak.node1 line added by Agent
#

3. Reboot node

4. crsctl stat res -t -init


or

If the permissions on the .oracle folder are not correct on the faulty node, check the permissions on a surviving node and apply the same settings, for example:

chmod 1777 /var/tmp/.oracle
chown root:dba /var/tmp/.oracle

to bring that directory on node1 into line with node2 and node3.
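
A hedged sketch of comparing the faulty node against a surviving node before and after the change (node names are illustrative):

# on a surviving node, note the expected mode and ownership
ssh node2 "ls -ld /var/tmp/.oracle"
# on the faulty node, apply the same settings (as above) and re-check
ls -ld /var/tmp/.oracle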


[root@node1 tmp]# ls -la
total 2352
drwxrwxrwt. 5 root root 4096 Oct 2 12:46 .
drwxr-xr-x. 22 root root 4096 Feb 27 2018 ..
-rw-r--r-- 1 fusadm dba 1976 Oct 1 03:45 apache_alert
-rw-r--r-- 1 fusadm dba 77 Oct 2 12:45 apache_alert.sh
-rw-r--r-- 1 fusadm dba 1976 Oct 1 03:45 apache_master
-rw-r--r-- 1 fusadm dba 2095832 Oct 1 00:39 cloud.txt
-rw-r--r-- 1 fusadm dba 3015 Sep 17 07:36 fil
-rw-r--r-- 1 fusadm dba 153 Sep 17 07:36 fil1
-rw-r--r-- 1 fusadm dba 153 Sep 17 07:36 fil1.srt
-rw------- 1 root root 1853 Jun 19 15:07 host_0
drwx------. 2 root root 16384 Oct 10 2017 lost+found
-rw-r--r-- 1 root root 157119 Aug 4 03:48 lpar2rrd-agent-10.160.36.212-root.err
-rw-r--r-- 1 root root 32661 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root-ps_job.txt
-rw-r--r-- 1 root root 0 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.stamp
-rw-r--r-- 1 root root 10 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.stamp-send
-rw-r--r-- 1 root root 0 Oct 2 07:57 lpar2rrd-agent-10.160.36.212-root.stamp-trimlogs
-rw-r--r-- 1 root root 11383 Oct 2 12:46 lpar2rrd-agent-10.160.36.212-root.txt
-rw-r--r-- 1 root root 8691 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.txtorig
-rw-r--r-- 1 root root 0 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.txt-tmp
-rw-r--r-- 1 root root 1761 Oct 2 12:46 lpar2rrd-agent.out
-rw-r--r-- 1 fusadm dba 2 Sep 13 11:39 mark
drwxrwxrwt 2 root dba 12288 Oct 2 01:43 oldoracle
drwxr-xr-x 2 root root 4096 Oct 2 12:46 .oracle
-rw-r--r-- 1 fusadm dba 17 Sep 20 09:40 test_email
[root@node1 tmp]# cd .oracle
[root@node1 .oracle]# ls -ltr

[root@node1 bin]# pwd
/u02/oracle/12.1.0.2/grid/bin
[root@node1 bin]# ls -ld /u02/oracle/12.1.0.2/grid/mdns/init
drwxr-x--- 2 oracrs dba 4096 Oct 11 2017 /u02/oracle/12.1.0.2/grid/mdns/init
[root@node1 bin]# cd /u02/oracle/12.1.0.2/grid/mdns/init
[root@node1 bin]# ls -ld /u02/oracle/12.1.0.2/grid/mdns/init/node1
-rw-r--r-- 1 oracrs dba 0 Oct 11 2017 /u02/oracle/12.1.0.2/grid/mdns/init/node1
[root@node1 bin]#

or

Be aware of the network socket files before you clean up /var/tmp, /tmp or /usr/tmp; they are critical for Oracle Clusterware to run.
ORA-29701: unable to connect to Cluster Synchronization Service
Unable To Connect To Cluster Manager as Network Socket Files are Removed

Purpose

This note explains relevant issues if Oracle Clusterware's network socket files are deleted or wrongly owned.


Details

Oracle Clusterware (CRS or Grid Infrastructure) network socket files are located in /tmp/.oracle, /usr/tmp/.oracle or /var/tmp/.oracle. To keep the clusterware healthy, it is important not to touch them manually unless instructed to by Oracle Support.


Cause

The hidden directory '/var/tmp/.oracle' (or /tmp/.oracle on some platforms) or its content was removed while instances & the CRS stack were up and running. Typically this directory contains a number of "special" socket files that are used by local clients to connect via the IPC protocol (sqlnet) to various Oracle processes including the TNS listener, the CSS, CRS & EVM daemons or even database or ASM instances. These files are created when the "listening" process starts.
A typical listing of the '/var/tmp/.oracle' shows a number of such files:
# cd /var/tmp/.oracle
# ls -l
srwxrwxrwx 1 oracle dba 0 Sep 6 10:50 s#9862.2
srwxrwxrwx 1 oracle dba 0 Sep 15 11:35 sAracnode1_crs_evm
srwxrwxrwx 1 root root 0 Sep 15 11:35 sracnode1DBG_CRSD
srwxrwxrwx 1 oracle dba 0 Sep 15 11:34 sracnode1DBG_CSSD
srwxrwxrwx 1 oracle dba 0 Sep 15 11:35 sracnode1DBG_EVMD
srwxrwxrwx 1 oracle dba 0 Sep 15 11:35 sCracnode1_crs_evm
srwxrwxrwx 1 root root 0 Sep 15 11:35 sCRSD_UI_SOCKET
srwxrwxrwx 1 oracle dba 0 Sep 15 11:35 sEXTPROC
srwxrwxrwx 1 oracle dba 0 Sep 15 11:34 sOCSSD_LL_racnode1_crs
srwxrwxrwx 1 oracle dba 0 Sep 15 11:34 sOracle_CSS_LclLstnr_crs_1
srwxrwxrwx 1 root root 0 Sep 15 11:35 sora_crsqs
srwxrwxrwx 1 root root 0 Sep 15 11:35 sprocr_local_conn_0_PROC
srwxrwxrwx 1 oracle dba 0 Sep 15 11:35 sSYSTEM.evm.acceptor.auth

When a file is deleted on Unix, it becomes "invisible" at the filesystem level, however any process which had the file opened when it was deleted will still be able to use it.
Attempts to open a "deleted" file for reading will fail (ENOENT 2 /* No such file or directory */) , opening a file with the same name for writing will create a new (different) file.
Therefore only processes that attempted to open the socket file during the initial handshake were failing with ORA-29701 while existing processes were unaffected.
A very common cause for this issue are system administration activities that involve freeing up space in /tmp, /var/tmp etc - either run occasionally or regularly via cronjobs. As a rule of thumb the directory .oracle in /var/tmp or /tmp should always be excluded from such activities. The best time to completely clean out these directories would be during system boot - before the clusterware is started.
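
As a hedged example, a temp-cleanup job could prune the .oracle directory so that its socket files are never touched (the 7-day age threshold is illustrative):

# list files older than 7 days under /var/tmp, but never descend into .oracle
find /var/tmp -path /var/tmp/.oracle -prune -o -type f -mtime +7 -print
# review the output first; only then delete the listed files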

Solution

The only way to re-create these special files is to restart (instance, listener, CRS). In a RAC environment this requires the shutdown & restart of the entire CRS stack.

As these special files are required to communicate with the various CRS daemons, it most likely will not be possible to stop (and restart) the CRS stack using the following commands as user root - but it won't hurt to try it anyway:

11g:
# $ORA_CRS_HOME/bin/crsctl stop crs
# $ORA_CRS_HOME/bin/crsctl start crs

If the above fails to successfully stop the CRS stack, a system reboot will be inevitable.

As for deleting files from temporary directory via a cronjob (or otherwise):
the directory '/var/tmp/.oracle' (on some platform /tmp/.oracle) should be excluded from such jobs/tasks. The files in this directory occupy only a few bytes and generally do not need to be cleaned up.
Please note that the location of the .oracle directory is not configurable, so the only way to avoid such issues is to make sure it is not deleted while the clusterware is up & running.
If the specified temp location must be cleaned to release space, consider deleting only files which meet both criteria:
or

12.1.0.2 Grid Infrastructure Stack Fails to Start due to .pid File Issues (Doc ID 2028511.1)

Applies to:
Oracle Database - Enterprise Edition - Version 12.1.0.2 and later
Information in this document applies to any platform.
Symptoms

On a 12.1.0.2 multi-node cluster, when starting the clusterware, the GI stack fails to start. This can happen to any of the resources managed by OHASD, e.g. ora.mdnsd (mdnsd.bin), ora.gipcd (gipcd.bin), ora.gpnpd (gpnpd.bin), ora.evmd (evmd.bin) etc.

Case I. No messages are written to the corresponding <process>.trc file. The node alert log (<GI home>/log/<node>/alert<node>.log) shows the following messages:
2015-07-07 14:43:22.594 [ORAAGENT(4642)]CRS-5818: Aborted command 'start' for resource 'ora.gipcd'. Details at (:CRSAGF00113:) {0:0:2} in /u01/app/grid/diag/crs/racnode1/crs/trace/ohasd_oraagent_grid.trc.
2015-07-07 14:43:23.496 [ORAAGENT(4642)]CRS-5017: The resource action "ora.gipcd start" encountered the following error:
2015-07-07 14:43:23.496+Start action for daemon aborted. For details refer to "(:CLSN00107:)" in "/u01/app/grid/diag/crs/racnode1/crs/trace/ohasd_oraagent_grid.trc".
2015-07-07 14:43:26.803 [OHASD(4253)]CRS-2757: Command 'Start' timed out waiting for response from the resource 'ora.gipcd'. Details at (:CRSPE00163:) {0:0:2} in /u01/app/grid/diag/crs/racnode1/crs/trace/ohasd.trc.


Case II. <ORACLE_BASE>/diag/crs/<host>/crs/trace/gipcd.trc shows:

2015-10-26 11:58:05.901092 : CLSDMT:180586240: PID for the Process [5395], connkey 13
2015-10-26 11:58:05.901470 : CLSDMT:180586240: Failed to record pid for GIPCD
2015-10-26 11:58:05.901474 : CLSDMT:180586240: Terminating process
...
trace file /home/grid/oraclebase/diag/crs/racnode1/crs/trace/gipcd.trc
Oracle Database 12c Clusterware Release 12.1.0.2.0 - Production Copyright 1996, 2014 Oracle. All rights reserved.
DDE: Flood control is not active
2015-10-26 11:59:05.909217 : GIPCD:180586240: gipcd_ExitCB: one or more of gipcdThreads failed to come into offline in 60 seconds of time, aborting the gipcd process
CLSB:180586240: Oracle Clusterware infrastructure error in GIPCD (OS PID 5395): Fatal signal 6 has occurred in program gipcd thread 180586240; nested signal count is 1
Incident 33 created, dump file: /home/grid/oraclebase/diag/crs/racnode1/crs/incident/incdir_33/gipcd_i33.trc
CRS-8503 [] [] [] [] [] [] [] [] [] [] [] []


Case III. crsctl start crs shows:

Start action for daemon aborted. For details refer to "(:CLSN00107:)" in "D:\app\grid\racdbauser\diag\crs\racnode1\crs\trace\ohasd_oraagent_system.trc".
CRS-2674: Start of 'ora.mdnsd' on 'racnode1' failed
CRS-2679: Attempting to clean 'ora.mdnsd' on 'racnode1'
CRS-2681: Clean of 'ora.mdnsd' on 'racnode1' succeeded
CRS-2672: Attempting to start 'ora.gpnpd' on 'racnode1'
CRS-5017: The resource action "ora.gpnpd start" encountered the following error:

Start action for daemon aborted. For details refer to "(:CLSN00107:)" in "D:\app\grid\racdbauser\diag\crs\racnode1\crs\trace\ohasd_oraagent_system.trc".
CRS-2883: Resource 'ora.gpnpd' failed during Clusterware stack start.
CRS-4406: Oracle High Availability Services synchronous start failed.
CRS-4000: Command Start failed, or completed with errors.
2015/11/30 18:32:21 CLSRSC-117: Failed to start Oracle Clusterware stack

Cause

The issues are caused either by wrong ownership of the *.pid or *OUT.trc files corresponding to the resources, or by the corresponding *.pid file being missing.

Starting with Grid Infrastructure release 12.1.0.2, the pid file for each daemon process not only exists under <GRID_HOME>/<resource>/<host>.pid, but also exists under <ORACLE_BASE>/crsdata/<hostname>/output/<resource>.pid.


Case I. It is caused by wrong ownership of the *.pid file under <ORACLE_BASE>/crsdata/<hostname>/output/<resource>.pid.
-rw-r--r-- 1 root root 4943 Jun 18 07:20 /u01/app/grid/crsdata/racnode1/output/gipcdOUT.trc
-rw-r--r-- 1 root root 5 Jun 18 07:20 /u01/app/grid/crsdata/racnode1/output/gipcd.pid

Files are owned by root, and the <grid> user has no write privilege on them.

If the file permission of the <resource>.pid does not allow writing by the <grid> user, then the process will fail to start. This includes gipcd.bin, gpnpd.bin, mdnsd.bin, evmd.bin etc.

When this happens, a file named /tmp/<resource>_<pid>.out is generated, for example /tmp/gipcd_32055.out, with content similar to:
Oracle Clusterware infrastructure error in GIPCD (OS PID 32055): Error in an OS-dependent function or service
Error category: -2, operation: open, location: SCLSB00009, OS error: 13
OS error message: Permission denied
Additional information: Call to open daemon stdout/stderr file failed
Oracle Clusterware infrastructure fatal error in GIPCD (OS PID 32055): Internal error (ID (:CLSB00126:)) - Failed to redirect daemon standard outputs using location /u01/app/grid/crsdata/racnode1/output and root name gipcd

This file shows the exact path / location of files that need to be checked.

Case II. It is caused by the GRID_HOME/gipc/init/<host> or GRID_HOME/gipc/init/<host>.pid files not being present or not being writable.


Case III. It is caused by the *.pid file missing from both locations due to accidental manual removal of these files.

Solution

Case I. Change the ownership of the files to be owned by <grid> user, eg: as root user:
# cd /u01/app/grid/crsdata/racnode1/output
# chown grid:oinstall gipcd*

Make similar changes to other PID files if needed.

The clusterware should start up automatically after the change.
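
A hedged way to locate any other files in that output directory with wrong ownership (directory path taken from the example above; adjust the owner to your grid user):

cd /u01/app/grid/crsdata/racnode1/output
find . \( -name '*.pid' -o -name '*OUT.trc' \) ! -user grid -ls
# chown <grid>:oinstall every file reported, then let the stack start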

Case II. Touch two files and set the correct ownership and permission, eg:

as grid user:

touch <GRID_HOME>/gipc/init/racnode1
touch <GRID_HOME>/gipc/init/racnode1.pid
chmod 644 <GRID_HOME>/gipc/init/*

Restart the CRS stack.


Case III. Recreate an empty *.pid file for each missing pid file and set the correct ownership and permission (a sketch follows).
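
For example, if gpnpd.pid were the missing file, a hedged sketch based on the reference listings below (paths and node name are illustrative):

# as root: recreate the empty pid file in both locations and give it to the grid user
touch /u01/app/grid/crsdata/racnode1/output/gpnpd.pid
chown grid:oinstall /u01/app/grid/crsdata/racnode1/output/gpnpd.pid
chmod 644 /u01/app/grid/crsdata/racnode1/output/gpnpd.pid
touch <GRID_HOME>/gpnp/init/racnode1.pid
chown grid:oinstall <GRID_HOME>/gpnp/init/racnode1.pid
chmod 644 <GRID_HOME>/gpnp/init/racnode1.pid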

As a reference, here are the *.pid files that exist under <ORACLE_BASE>/crsdata/<hostname>/output/:
-rw-r--r--. 1 oracle oinstall 5 Dec 1 08:36 crsd_oraagent_oracle.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:35 crsd_oraagent_grid.pid
-rw-r--r--. 1 root root 5 Dec 1 08:36 crsd_orarootagent_root.pid
-rw-r--r--. 1 root root 5 Dec 1 08:35 crsd.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:36 crsd_scriptagent_grid.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 evmd.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 evmlogger.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 gipcd.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 gpnpd.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 mdnsd.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ocssd.pid
-rw-r--r--. 1 root root 5 Dec 1 08:35 octssd.pid
-rw-r--r--. 1 root root 5 Dec 1 08:34 ohasd_cssdagent_root.pid
-rw-r--r--. 1 root root 5 Dec 1 08:34 ohasd_cssdmonitor_root.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ohasd_oraagent_grid.pid
-rw-r--r--. 1 root root 5 Dec 1 08:35 ohasd_orarootagent_root.pid
-rw-r--r--. 1 root root 5 Dec 1 08:34 ohasd.pid
-rw-r--r--. 1 root root 5 Dec 1 08:35 ologgerd.pid
-rw-r--r--. 1 root root 5 Dec 1 08:35 osysmond.pid


Here are the pid files that exist under GRID_HOME:
-rw-r--r--. 1 root root 0 Jul 29 14:52 ./crs/init/lccn0
-rw-r--r--. 1 root root 5 Dec 1 08:35 ./crs/init/lccn0.pid
-rw-r--r--. 1 root root 0 Jul 29 14:51 ./ctss/init/lccn0
-rw-r--r--. 1 root root 5 Dec 1 08:35 ./ctss/init/lccn0.pid
-rw-r--r--. 1 grid oinstall 0 Jul 29 14:50 ./evm/init/lccn0
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ./evm/init/lccn0.pid
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ./gipc/init/lccn0
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ./gipc/init/lccn0.pid
-rw-r--r--. 1 grid oinstall 0 Jul 29 14:50 ./gpnp/init/lccn0
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ./gpnp/init/lccn0.pid
-rw-r--r--. 1 grid oinstall 0 Jul 29 14:50 ./mdns/init/lccn0
-rw-r--r--. 1 grid oinstall 5 Dec 1 08:34 ./mdns/init/lccn0.pid
-rw-r--r--. 1 root root 0 Jul 29 14:50 ./ohasd/init/lccn0
-rw-r--r--. 1 root root 5 Dec 1 08:34 ./ohasd/init/lccn0.pid
-rw-r--r--. 1 root root 0 Jul 29 14:54 ./ologgerd/init/lccn0
-rw-r--r--. 1 root root 5 Dec 1 08:35 ./ologgerd/init/lccn0.pid
-rw-r--r--. 1 root root 0 Jul 29 14:52 ./osysmond/init/lccn0
-rw-r--r--. 1 root root 5 Dec 1 08:35 ./osysmond/init/lccn0.pid


For windows platform, the corresponding pid files should exist under <ORACLE_BASE>\Administrator\crsdata\<host>\output\:
crsd.pid
crsd_oraagent_system.pid
crsd_oraagent_system.pid
crsd_scriptagent_system.pid
crsd_orarootagent_system.pid
evmlogger.pid
evmd.pid
gpnpd.pid
gipcd.pid
mdnsd.pid
ocssd.pid
octssd.pid
ohasd.pid
ohasd_cssdagent_system.pid
ohasd_cssdmonitor_system.pid
ohasd_orarootagent_system.pid
ohasd_oraagent_system.pid
ologgerd.pid
osysmond.pid


The <host> and <host>.pid files exist under <GRID_HOME> in:
./crs/init/
./evm/init/
./gpnp/init/
./mdns/init/
./gipc/init/
./ohasd/init/
./ctss/init/
./osysmond/init/
./ologgerd/init/

Restart the CRS stack after the above.


==========================


Top 5 Grid Infrastructure Startup Issues (Doc ID 1368382.1)


Applies to:
Oracle Database - Enterprise Edition - Version 11.2.0.1 to 11.2.0.4 [Release 11.2]
Oracle Database Cloud Schema Service - Version N/A and later
Oracle Database Exadata Express Cloud Service - Version N/A and later
Oracle Database Exadata Cloud Machine - Version N/A and later
Oracle Cloud Infrastructure - Database Service - Version N/A and later
Information in this document applies to any platform.
Purpose

The purpose of this note is to provide a summary of the top 5 issues that may prevent the successful startup of the Grid Infrastructure (GI) stack.
Scope

This note applies to 11gR2 Grid Infrastructure only.

To determine the status of GI, please run the following commands:
1. $GRID_HOME/bin/crsctl check crs
2. $GRID_HOME/bin/crsctl stat res -t -init
3. $GRID_HOME/bin/crsctl stat res -t
4. ps -ef | egrep 'init|d.bin'
Details
Issue #1: CRS-4639: Could not contact Oracle High Availability Services, ohasd.bin not running or ohasd.bin is running but no init.ohasd or other processes

Symptoms:

1. Command '$GRID_HOME/bin/crsctl check crs' returns error:
CRS-4639: Could not contact Oracle High Availability Services
2. Command 'ps -ef | grep init' does not show a line similar to:
root 4878 1 0 Sep12 ? 00:00:02 /bin/sh /etc/init.d/init.ohasd run
3. Command 'ps -ef | grep d.bin' does not show a line similar to:
root 21350 1 6 22:24 ? 00:00:01 /u01/app/11.2.0/grid/bin/ohasd.bin reboot
Or it may only show "ohasd.bin reboot" process without any other processes
4. ohasd.log report:
2013-11-04 09:09:15.541: [ default][2609911536] Created alert : (:OHAS00117:) : TIMED OUT WAITING FOR OHASD MONITOR
5. ohasOUT.log report:
2013-11-04 08:59:14
Changing directory to /u01/app/11.2.0/grid/log/lc1n1/ohasd
OHASD starting
Timed out waiting for init.ohasd script to start; posting an alert
6. ohasd.bin keeps restarting, ohasd.log report:
2014-08-31 15:00:25.132: [ CRSSEC][733177600]{0:0:2} Exception: PrimaryGroupEntry constructor failed to validate group name with error: 0 groupId: 0x7f8df8022450 acl_string: pgrp:spec:r-x
2014-08-31 15:00:25.132: [ CRSSEC][733177600]{0:0:2} Exception: ACL entry creation failed for: pgrp:spec:r-x
2014-08-31 15:00:25.132: [ INIT][733177600]{0:0:2} Dump State Starting ...
7. Only the ohasd.bin is running, but there is nothing written in ohasd.log. OS /var/log/messages shows:
2015-07-12 racnode1 logger: autorun file for ohasd is missing


Possible Causes:

1. For OL5/RHEL5 and below and other platforms, the file '/etc/inittab' does not contain a line similar to the following (platform dependent):
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
For OL6/RHEL6+, upstart is not configured properly.
2. runlevel 3 has not been reached, some rc3 script is hanging
3. the init process (pid 1) did not spawn the process defined in /etc/inittab (h1) or a bad entry before init.ohasd like xx:wait:<process> blocked the start of init.ohasd
4. CRS autostart is disabled
5. The Oracle Local Registry ($GRID_HOME/cdata/<node>.olr) is missing or corrupted (check as root user via "ocrdump -local /tmp/olr.log", the /tmp/olr.log should contain all GI daemon processes related information, compare with a working cluster to verify)
6. root user was in group "spec" before but now the group "spec" has been removed, the old group for root user is still recorded in the OLR, this can be verified in OLR dump
7. HOSTNAME was null when init.ohasd started especially after a node reboot


Solutions:

1. For OL5/RHEL5 and under, add the following line to /etc/inittab
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
and then run "init q" as the root user.
For Linux OL6/RHEL6, please refer to Note 1607600.1
2. Run command 'ps -ef | grep rc' and kill any remaining rc3 scripts that appear to be stuck.
3. Remove the bad entry before init.ohasd. Consult with OS vendor if "init q" does not spawn "init.ohasd run" process. As a workaround,
start the init.ohasd manually, eg: as root user, run "/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null &"
4. Enable CRS autostart:
# crsctl enable crs
# crsctl start crs
5. Restore OLR from backup, as root user: (refer to Note 1193643.1)
# crsctl stop crs -f
# touch <GRID_HOME>/cdata/<node>.olr
# chown root:oinstall <GRID_HOME>/cdata/<node>.olr
# ocrconfig -local -restore <GRID_HOME>/cdata/<node>/backup_<date>_<num>.olr
# crsctl start crs

If OLR backup does not exist for any reason, perform deconfig and rerun root.sh is required to recreate OLR, as root user:
# <GRID_HOME>/crs/install/rootcrs.pl -deconfig -force
# <GRID_HOME>/root.sh
6. Reinitialize/Recreate the OLR is required, using the same command as recreating OLR per above
7. Restart the init.ohasd process or add "sleep 30" in init.ohasd to allow the hostname to be populated correctly before starting Clusterware, refer to Note 1427234.1
8. If the above does not help, check OS messages for the ohasd.bin logger message and manually execute the crswrapexece.pl command mentioned in the OS message, with LD_LIBRARY_PATH set to <GRID_HOME>/lib, to continue debugging (see the sketch below).
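
For solution 8, a hedged sketch of re-running the logged command by hand (copy the exact crswrapexece.pl arguments from your own OS messages file; the form below mirrors the example message shown later in this post):

export LD_LIBRARY_PATH=<GRID_HOME>/lib
<GRID_HOME>/perl/bin/perl -I<GRID_HOME>/perl/lib <GRID_HOME>/bin/crswrapexece.pl \
  <GRID_HOME>/crs/install/s_crsconfig_<node>_env.txt <GRID_HOME>/bin/ohasd.bin "reboot"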

Issue #2: CRS-4530: Communications failure contacting Cluster Synchronization Services daemon, ocssd.bin is not running

Symptoms:

1. Command '$GRID_HOME/bin/crsctl check crs' returns errors:
CRS-4638: Oracle High Availability Services is online
CRS-4535: Cannot communicate with Cluster Ready Services
CRS-4530: Communications failure contacting Cluster Synchronization Services daemon
CRS-4534: Cannot communicate with Event Manager
2. Command 'ps -ef | grep d.bin' does not show a line similar to:
oragrid 21543 1 1 22:24 ? 00:00:01 /u01/app/11.2.0/grid/bin/ocssd.bin
3. ocssd.bin is running but aborts with message "CLSGPNP_CALL_AGAIN" in ocssd.log
4. ocssd.log shows:

2012-01-27 13:42:58.796: [ CSSD][19]clssnmvDHBValidateNCopy: node 1, racnode1, has a disk HB, but no network HB, DHB has rcfg 223132864, wrtcnt, 1112, LATS 783238209,
lastSeqNo 1111, uniqueness 1327692232, timestamp 1327693378/787089065

5. For cases with 3 or more nodes, 2 nodes form the cluster fine, but the 3rd node joins and then fails; ocssd.log shows:

2012-02-09 11:33:53.048: [ CSSD][1120926016](:CSSNM00008:)clssnmCheckDskInfo: Aborting local node to avoid splitbrain. Cohort of 2 nodes with leader 2, racnode2, is smaller than
cohort of 2 nodes led by node 1, racnode1, based on map type 2
2012-02-09 11:33:53.048: [ CSSD][1120926016]###################################
2012-02-09 11:33:53.048: [ CSSD][1120926016]clssscExit: CSSD aborting from thread clssnmRcfgMgrThread

6. ocssd.bin startup times out after 10 minutes

2012-04-08 12:04:33.153: [ CSSD][1]clssscmain: Starting CSS daemon, version 11.2.0.3.0, in (clustered) mode with uniqueness value 1333911873
......
2012-04-08 12:14:31.994: [ CSSD][5]clssgmShutDown: Received abortive shutdown request from client.
2012-04-08 12:14:31.994: [ CSSD][5]###################################
2012-04-08 12:14:31.994: [ CSSD][5]clssscExit: CSSD aborting from thread GMClientListener
2012-04-08 12:14:31.994: [ CSSD][5]###################################
2012-04-08 12:14:31.994: [ CSSD][5](:CSSSC00012:)clssscExit: A fatal error occurred and the CSS daemon is terminating abnormally

7. alert<node>.log shows:
2014-02-05 06:16:56.815
[cssd(3361)]CRS-1714:Unable to discover any voting files, retrying discovery in 15 seconds; Details at (:CSSNM00070:) in /u01/app/11.2.0/grid/log/bdprod2/cssd/ocssd.log
...
2014-02-05 06:27:01.707
[ohasd(2252)]CRS-2765:Resource 'ora.cssdmonitor' has failed on server 'bdprod2'.
2014-02-05 06:27:02.075
[ohasd(2252)]CRS-2771:Maximum restart attempts reached for resource 'ora.cssd'; will not restart.

Possible Causes:

1. Voting disk is missing or inaccessible
2. Multicast is not working for private network for 11.2.0.2.x (expected behavior) or 11.2.0.3 PSU5/PSU6/PSU7 or 12.1.0.1 (due to Bug 16547309)
3. The private network is not working; ping or traceroute <private host> shows destination unreachable. Or a firewall is enabled for the private network while ping/traceroute work fine
4. gpnpd does not come up, stuck in dispatch thread, Bug 10105195
5. too many disks discovered via asm_diskstring or slow scan of disks due to Bug 13454354 on Solaris 11.2.0.3 only
6. In some cases, a known bug could prevent the 2nd node's ocssd.bin from joining the cluster after the private network issue is fixed, refer to Note 1479380.1


Solutions:

1. restore the voting disk access by checking storage access, disk permissions etc.
If the disk is not accessible at OS level, please engage system administrator to restore the disk access.
If the voting disk is missing from the OCR ASM diskgroup, start CRS in exclusive mode and recreate the voting disk:
# crsctl start crs -excl
# crsctl replace votedisk <+OCRVOTE diskgroup>
2. Refer to Document 1212703.1 for multicast test and fix. For 11.2.0.3 PSU5/PSU6/PSU7 or 12.1.0.1, either enable multicast for private network or apply patch 16547309 or latest PSU.
3. Consult with the network administrator to restore private network access or disable firewall for private network (for Linux, check service iptables status and service ip6tables status)
4. Kill the gpnpd.bin process on surviving node, refer Document 10105195.8
Once above issues are resolved, restart Grid Infrastructure stack.
If ping/traceroute all work for the private network and a failed 11.2.0.1 to 11.2.0.2 upgrade has happened, please check out
Bug 13416559 for the workaround
5. Limit the number of ASM disks scan by supplying a more specific asm_diskstring, refer to bug 13583387
For Solaris 11.2.0.3 only, please apply patch 13250497, see Note 1451367.1.
6. Refer to the solution and workaround in Note 1479380.1

Issue #3: CRS-4535: Cannot communicate with Cluster Ready Services, crsd.bin is not running

Symptoms:

1. Command '$GRID_HOME/bin/crsctl check crs' returns errors:
CRS-4638: Oracle High Availability Services is online
CRS-4535: Cannot communicate with Cluster Ready Services
CRS-4529: Cluster Synchronization Services is online
CRS-4534: Cannot communicate with Event Manager
2. Command 'ps -ef | grep d.bin' does not show a line similar to:
root 23017 1 1 22:34 ? 00:00:00 /u01/app/11.2.0/grid/bin/crsd.bin reboot
3. Even if the crsd.bin process exists, command 'crsctl stat res -t -init' shows:
ora.crsd
1 ONLINE INTERMEDIATE

Possible Causes:

1. ocssd.bin is not running or resource ora.cssd is not ONLINE
2. The +ASM<n> instance cannot start up for various reasons
3. OCR is inaccessible
4. Network configuration has been changed causing gpnp profile.xml mismatch
5. $GRID_HOME/crs/init/<host>.pid file for crsd has been removed or renamed manually, crsd.log shows: 'Error3 -2 writing PID to the file'
6. ocr.loc content mismatch with other cluster nodes. crsd.log shows: 'Shutdown CacheLocal. my hash ids don't match'
7. private network is pingable with normal ping command but not pingable with jumbo frame size (eg: ping -s 8900 <private ip>) when jumbo frame is enabled (MTU: 9000+). Or partial cluster nodes have jumbo frame set (MTU: 9000) and the problem node does not have jumbo frame set (MTU:1500)
8. On AIX 6.1 TL08 SP01 and AIX 7.1 TL02 SP01, due to truncation of multicast packets.
9. udp_sendspace is set to default 9216 on AIX platform


Solutions:

1. Check the solution for Issue 2, ensure ocssd.bin is running and ora.cssd is ONLINE
2. For 11.2.0.2+, ensure that the resource ora.cluster_interconnect.haip is ONLINE, refer to Document 1383737.1 for ASM startup issues related to HAIP.
Check if GRID_HOME/bin/oracle binary is linked with RAC option Document 284785.1
3. Ensure the OCR disk is available and accessible. If the OCR is lost for any reason, refer to Document 1062983.1 on how to restore the OCR.
4. Restore network configuration to be the same as interface defined in $GRID_HOME/gpnp/<node>/profiles/peer/profile.xml, refer to Document 283684.1 for private network modification.
5. touch the file with <host>.pid under $GRID_HOME/crs/init.
For 11.2.0.1, the file is owned by <grid> user.
For 11.2.0.2, the file is owned by root user.
6. Using ocrconfig -repair command to fix the ocr.loc content:
for example, as root user:
# ocrconfig -repair -add +OCR2 (to add an entry)
# ocrconfig -repair -delete +OCR2 (to remove an entry)
ohasd.bin needs to be up and running in order for above command to run.

Once above issues are resolved, either restart GI stack or start crsd.bin via:
# crsctl start res ora.crsd -init
7. Engage network admin to enable jumbo frame from switch layer if it is enabled at the network interface. If jumbo frame is not required, change MTU to 1500 for the private network on all nodes, then restart GI stack on all nodes.
8. On AIX 6.1 TL08 SP01 and AIX 7.1 TL02 SP01, apply AIX patch per Document 1528452.1 AIX 6.1 TL8 or 7.1 TL2: 11gR2 GI Second Node Fails to Join the Cluster as CRSD and EVMD are in INTERMEDIATE State
9. Increase udp_sendspace to recommended value, refer to Document 1280234.1

Issue #4: Agent or mdnsd.bin, gpnpd.bin, gipcd.bin not running

Symptoms:

1. orarootagent not running. ohasd.log shows:
2012-12-21 02:14:05.071: [ AGFW][24] {0:0:2} Created alert : (:CRSAGF00123:) : Failed to start the agent process: /grid/11.2.0/grid_2/bin/orarootagent Category: -1 Operation: fail Loc: canexec2 OS error: 0 Other : no exe permission, file [/grid/11.2.0/grid_2/bin/orarootagent]
2. mdnsd.bin, gpnpd.bin or gipcd.bin not running, here is a sample for mdnsd log file:
2012-12-31 21:37:27.601: [ clsdmt][1088776512]Creating PID [4526] file for home /u01/app/11.2.0/grid host lc1n1 bin mdns to /u01/app/11.2.0/grid/mdns/init/
2012-12-31 21:37:27.602: [ clsdmt][1088776512]Error3 -2 writing PID [4526] to the file []
2012-12-31 21:37:27.602: [ clsdmt][1088776512]Failed to record pid for MDNSD
or
2012-12-31 21:39:52.656: [ clsdmt][1099217216]Creating PID [4645] file for home /u01/app/11.2.0/grid host lc1n1 bin mdns to /u01/app/11.2.0/grid/mdns/init/
2012-12-31 21:39:52.656: [ clsdmt][1099217216]Writing PID [4645] to the file [/u01/app/11.2.0/grid/mdns/init/lc1n1.pid]
2012-12-31 21:39:52.656: [ clsdmt][1099217216]Failed to record pid for MDNSD
3. oraagent or appagent not running, crsd.log shows:
2012-12-01 00:06:24.462: [ AGFW][1164069184] {0:2:27} Created alert : (:CRSAGF00130:) : Failed to start the agent /u01/app/grid/11.2.0/bin/appagent_oracle

Possible Causes:

1. orarootagent missing execute permission
2. missing process associated <node>.pid file or the file has wrong ownership or permission
3. wrong permission/ownership within GRID_HOME
4. GRID_HOME disk space 100% full


Solutions:

1. Either compare the permission/ownership with a good node GRID_HOME and make correction accordingly or as root user:
# cd <GRID_HOME>/crs/install
# ./rootcrs.pl -unlock
# ./rootcrs.pl -patch
This will stop the clusterware stack, set permission/ownership to root for the required files, and restart the clusterware stack.
2. If the corresponding <node>.pid does not exist, touch the file with correct ownership and permission, otherwise correct the <node>.pid ownership/permission as required, then restart the clusterware stack.
Here is the list of <node>.pid file under <GRID_HOME>, owned by root:root, permission 644:
./ologgerd/init/<node>.pid
./osysmond/init/<node>.pid
./ctss/init/<node>.pid
./ohasd/init/<node>.pid
./crs/init/<node>.pid
Owned by <grid>:oinstall, permission 644:
./mdns/init/<node>.pid
./evm/init/<node>.pid
./gipc/init/<node>.pid
./gpnp/init/<node>.pid

3. For cause 3, please refer to solution 1.
4. Please clean up the disk space from GRID_HOME, particularly clean up old files under <GRID_HOME>/log/<node>/client/, <diag dest>/tnslsnr/<node>/<listener name>/alert/
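
For solution 4, a hedged sketch for checking space and finding old files to clean up (paths and the 30-day threshold are illustrative):

df -h <GRID_HOME>
du -sh <GRID_HOME>/log/<node>/client
# list old client log/trace files before removing or archiving them
find <GRID_HOME>/log/<node>/client -type f -mtime +30 -ls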

Issue #5: ASM instance does not start, ora.asm is OFFLINE

Symptoms:

1. Command 'ps -ef | grep asm' shows no ASM processes
2. Command 'crsctl stat res -t -init' shows:
ora.asm
1 ONLINE OFFLINE


Possible Causes:

1. ASM spfile is corrupted
2. ASM discovery string is incorrect and therefore voting disk/OCR cannot be discovered
3. ASMlib configuration problem
4. ASM instances are using different cluster_interconnects; HAIP being OFFLINE on one node prevents the 2nd ASM instance from starting


Solutions:

1. Create a temporary pfile to start the ASM instance, then recreate the spfile, see Document 1095214.1 for more details (a sketch follows this list).
2. Refer to Document 1077094.1 to correct the ASM discovery string.
3. Refer to Document 1050164.1 to fix ASMlib configuration.
4. Refer to Document 1383737.1 for solution. For more information about HAIP, please refer to Document 1210883.1
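
For solution 1, a hedged sketch of a minimal temporary pfile and its use (all parameter values, the SID and the diskgroup names are illustrative and must match your environment; see Document 1095214.1):

# /tmp/init+ASM1.ora -- minimal illustrative pfile
*.instance_type='asm'
*.asm_diskstring='/dev/oracleasm/disks/*'
*.asm_diskgroups='OCR_VOTE','ACFS_DATA'
*.asm_power_limit=1
*.large_pool_size=12M

# start ASM with the temporary pfile, then recreate the spfile in a diskgroup
export ORACLE_SID=+ASM1
sqlplus / as sysasm
SQL> startup pfile='/tmp/init+ASM1.ora';
SQL> create spfile='+OCR_VOTE' from pfile='/tmp/init+ASM1.ora';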





============

Oracle Clusterware Cannot Start on all Nodes: Network communication with node <NAME> missing for 90% of timeout interval (Doc ID 1507482.1)



Applies to:
Oracle Database - Enterprise Edition - Version 11.2.0.1 and later
Information in this document applies to any platform.
Purpose

This note is a troubleshooting guide for the following situation: Oracle Clusterware cannot be started on all nodes at once. For example, in a 2-node cluster, the Oracle Clusterware on the 2nd node won't start, or, attempting to start clusterware on the second node causes the first node's clusterware to shutdown.

In the clusterware alert log ($GRID_HOME/log/<hostname>/alert<hostname>.log) of one or more nodes where Oracle Clusterware is started, the following messages are seen:
2012-07-14 19:24:18.420
[cssd(6192)]CRS-1612:Network communication with node racnode02 (2) missing for 50% of timeout interval. Removal of this node from cluster in 14.500 seconds
2012-07-14 19:24:25.422
[cssd(6192)]CRS-1611:Network communication with node racnode02 (2) missing for 75% of timeout interval. Removal of this node from cluster in 7.500 seconds
2012-07-14 19:24:30.424
[cssd(6192)]CRS-1610:Network communication with node racnode02 (2) missing for 90% of timeout interval. Removal of this node from cluster in 2.500 seconds
2012-07-14 19:24:32.925
[cssd(6192)]CRS-1607:Node racnode02 is being evicted in cluster incarnation 179915229; details at (:CSSNM00007:) in /u01/app/gridhome/log/racnode01/cssd/ocssd.log.


In the clusterware alert log ($GRID_HOME/log/<hostname>/alert<hostname>.log) of the evicted node(s), the following messages are seen:
2012-07-14 19:24:29.282
[cssd(8625)]CRS-1608:This node was evicted by node 1, racnode01; details at (:CSSNM00005:) in /u01/app/gridhome/log/racnode02/cssd/ocssd.log.
2012-07-14 19:24:29.282
[cssd(8625)]CRS-1656:The CSS daemon is terminating due to a fatal error; Details at (:CSSSC00012:) in /u01/app/gridhome/log/racnode02/cssd/ocssd.log

Troubleshooting Steps

The Oracle clusterware cannot be up on two (or more) nodes if those nodes cannot communicate with each other over the interconnect.

The CRS-1612, CRS-1611, CRS-1610 messages "Network communication with node NAME(n) missing for PCT% of timeout interval" warn that ocssd on that node cannot communicate with ocssd on the other node(s) over the interconnect. If this persists for the full timeout interval (usually thirty seconds - reference: Document 294430.1) then Oracle Clusterware is designed to evict one of the nodes.

Therefore, the issue that requires troubleshooting in such a case is why the nodes cannot communicate over the interconnect.
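
A hedged starting point for that investigation (interface names and addresses are illustrative):

# confirm which interface/subnet GI registered as the cluster interconnect
$GRID_HOME/bin/oifcfg getif
# basic reachability over the private network, run from each node
ping -c 3 <private IP of the other node>
traceroute <private IP of the other node>
# if jumbo frames are configured (MTU 9000), also test with a large payload
ping -s 8900 <private IP of the other node>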


===== Clusterware does not start on ALL nodes after reboot (Doc ID 1676719.1) ========



Applies to:
Oracle Database - Enterprise Edition - Version 11.2.0.1 to 11.2.0.4 [Release 11.2]
Information in this document applies to any platform.
Symptoms

Clusterware fails to start on ALL nodes after the cluster nodes are rebooted. The nodes in the cluster are mdbp01, mdbp02, mdbp03 and mdbp04, with cluster name "crs".

$GRID_HOME/log/<node>/gpnpd/gpnpd.log reports the errors below:
2014-01-23 01:56:50.376: [ CLSXSEC][7]clsxsec_CtxCKVerify: [at clsxsec.c:1768] Result: (10017) CLSXERR_SEC_BSAFE_DATA. Failed to verify: nzerr=29237 vstat=2
2014-01-23 01:56:50.376: [ GPNP][7]clsgpnpd_validateProfile: [at clsgpnpd.c:2888] Result: (89) CLSGPNP_SIG_WALLETDIF. Profile failed to verify. prf=6000000000d99e30
2014-01-23 01:56:50.376: [ GPNP][7]clsgpnpd_putProfileDo: [at clsgpnpd.c:5336] Result: (89) CLSGPNP_SIG_WALLETDIF. PUT>>> REFUSED best p=6000000000d99e30 from "tcp://rdbp08:10653"
2014-01-23 01:56:50.378: [ GPNP][7]clsgpnp_profileCallUrlInt: [at clsgpnp.c:2104] put-profile call to url "tcp://rdbp06:50099" disco "mdns:service:gpnp._tcp.local.://rdbp06:50099/agent=gpnpd,cname=crs,host=rdbp06,pid=6049/gpnpd h:rdbp06 c:crs" [f=0 claimed- host:mdbp01 cname:crs seq:6 auth:CN=GPnP_peer]
2014-01-23 01:56:52.111: [ OCRMSG][3]GIPC error [29] msg [gipcretConnectionRefused]
2014-01-23 01:57:06.251: [ OCRMSG][3]GIPC error [29] msg [gipcretConnectionRefused]
2014-01-23 01:57:27.462: [ OCRMSG][3]GIPC error [29] msg [gipcretConnectionRefused]

$GRID_HOME/log/<node>/cssd/ocssd.log shows:

2014-01-23 02:37:21.387: [ CSSD][5]clssgmEvtInformation: reqtype (11) req (6000000000c081f0)
2014-01-23 02:37:21.387: [ CSSD][5]clssnmQueueNotification: type (11) 6000000000c081f0
2014-01-23 02:37:22.639: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741] Msg-reply has soap fault 10 (Operation returned Retry (error CLSGPNP_CALL_AGAIN)) [uri http://www.grid-pnp.org/2005/12/gpnp-errors#"]
2014-01-23 02:37:24.659: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741] Msg-reply has soap fault 10 (Operation returned Retry (error CLSGPNP_CALL_AGAIN)) [uri http://www.grid-pnp.org/2005/12/gpnp-errors#"]
2014-01-23 02:37:26.679: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741] Msg-reply has soap fault 10 (Operation returned Retry (error CLSGPNP_CALL_AGAIN)) [uri http://www.grid-pnp.org/2005/12/gpnp-errors#"]
...
...
2014-01-23 02:47:18.568: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741] Msg-reply has soap fault 10 (Operation returned Retry (error CLSGPNP_CALL_AGAIN)) [uri http://www.grid-pnp.org/2005/12/gpnp-errors#"]
2014-01-23 02:47:19.377: [ CSSD][5]clssgmExecuteClientRequest: MAINT recvd from proc 3 (6000000000c1bb00)
2014-01-23 02:47:19.377: [ CSSD][5]clssgmShutDown: Received abortive shutdown request from client.
2014-01-23 02:47:19.377: [ CSSD][5]###################################
2014-01-23 02:47:19.377: [ CSSD][5]clssscExit: CSSD aborting from thread GMClientListener
2014-01-23 02:47:19.377: [ CSSD][5]###################################
2014-01-23 02:47:19.377: [ CSSD][5](:CSSSC00012:)clssscExit: A fatal error occurred and the CSS daemon is terminating abnormally

Cause

Another cluster is using the same cluster name "crs". This causes gpnpd from this cluster (containing nodes mdbp01, mdbp02, mdbp03 and mdbp04) to try to get the gpnp profile from the other cluster (containing nodes rdbp04, rdbp06, rdbp08 and rdbp09). As they do not belong to the same cluster, this leads to a profile validation failure and Grid Infrastructure cannot start.

Solution

"Cluster name" should be unique across clusters. To fix the issue, clustername can be changed by following below steps:

1. On all remote nodes of the mdb* cluster (contains nodes mdbp01,mdbp02,mdbp03 and mdbp04), as root user execute:
# <$GRID_HOME>/crs/install/rootcrs.pl -deconfig -force -verbose

2. Once the above command finishes on all remote nodes, on local node of mdb* cluster, as root user execute:
# <$GRID_HOME>/crs/install/rootcrs.pl -deconfig -force -verbose -keepdg -lastnode

3. Reconfigure and change the "Cluster name" by running $GRID_HOME/crs/config/config.sh, refer to note 1354258.1 for details

4. Run root.sh as prompted on each node to complete the configuration.

Note 1: from 12.1 onward, the cluster GUID is used for cluster node discovery, hence the cluster name will not be an issue.

Note 2: the solution above also provides the steps for changing the cluster name in a RAC environment.



=========================

This document is intended for Clusterware/RAC Database Administrators and Oracle support engineers.
Details
Start up sequence:

In a nutshell, the operating system starts ohasd, ohasd starts agents to start up daemons (gipcd, mdnsd, gpnpd, ctssd, ocssd, crsd, evmd, ASM etc), and crsd starts agents that start user resources (database, SCAN, listener etc).

For detailed Grid Infrastructure clusterware startup sequence, please refer to note 1053147.1

Cluster status


To find out cluster and daemon status:
$GRID_HOME/bin/crsctl check crs
CRS-4638: Oracle High Availability Services is online
CRS-4537: Cluster Ready Services is online
CRS-4529: Cluster Synchronization Services is online
CRS-4533: Event Manager is online

$GRID_HOME/bin/crsctl stat res -t -init
--------------------------------------------------------------------------------
NAME TARGET STATE SERVER STATE_DETAILS
--------------------------------------------------------------------------------
Cluster Resources
--------------------------------------------------------------------------------
ora.asm
1 ONLINE ONLINE rac1 Started
ora.crsd
1 ONLINE ONLINE rac1
ora.cssd
1 ONLINE ONLINE rac1
ora.cssdmonitor
1 ONLINE ONLINE rac1
ora.ctssd
1 ONLINE ONLINE rac1 OBSERVER
ora.diskmon
1 ONLINE ONLINE rac1
ora.drivers.acfs
1 ONLINE ONLINE rac1
ora.evmd
1 ONLINE ONLINE rac1
ora.gipcd
1 ONLINE ONLINE rac1
ora.gpnpd
1 ONLINE ONLINE rac1
ora.mdnsd
1 ONLINE ONLINE rac1

For 11.2.0.2 and above, there will be two more processes:

ora.cluster_interconnect.haip
1 ONLINE ONLINE rac1
ora.crf
1 ONLINE ONLINE rac1

For 11.2.0.3 onward in non-Exadata, ora.diskmon will be offline:

ora.diskmon
1 OFFLINE OFFLINE rac1

For 12c onward, ora.storage is introduced:

ora.storage
1 ONLINE ONLINE racnode1 STABLE



To start an offline daemon - if ora.crsd is OFFLINE:
$GRID_HOME/bin/crsctl start res ora.crsd -init

Case 1: OHASD does not start


As ohasd.bin is responsible for starting up all other clusterware processes directly or indirectly, it needs to start up properly for the rest of the stack to come up. If ohasd.bin is not up, CRS-4639 (Could not contact Oracle High Availability Services) will be reported when checking its status; if ohasd.bin is already up, CRS-4640 will be reported if another startup attempt is made; and if it fails to start, the following will be reported:

CRS-4124: Oracle High Availability Services startup failed.
CRS-4000: Command Start failed, or completed with errors.



Automatic ohasd.bin start up depends on the following:

1. OS is at appropriate run level:

The OS needs to be at the specified run level before CRS will try to start up.

To find out at which run level the clusterware needs to come up:
cat /etc/inittab|grep init.ohasd
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null

Note: Oracle Linux 6 (OL6) and Red Hat Linux 6 (RHEL6) have deprecated inittab; instead, init.ohasd is configured via upstart in /etc/init/oracle-ohasd.conf, however the process "/etc/init.d/init.ohasd run" should still be up. Oracle Linux 7 (and Red Hat Linux 7) uses systemd to manage start/stop services (example: /etc/systemd/system/oracle-ohasd.service)

The above example shows CRS is supposed to run at run levels 3 and 5; please note that, depending on the platform, CRS comes up at a different run level.

To find out current run level:
who -r



2. "init.ohasd run" is up

On Linux/UNIX, as "init.ohasd run" is configured in /etc/inittab, process init (pid 1, /sbin/init on Linux, Solaris and hp-ux, /usr/sbin/init on AIX) will start and respawn "init.ohasd run" if it fails. Without "init.ohasd run" up and running, ohasd.bin will not start:

ps -ef|grep init.ohasd|grep -v grep
root 2279 1 0 18:14 ? 00:00:00 /bin/sh /etc/init.d/init.ohasd run

Note: Oracle Linux 6 (OL6) and Red Hat Linux 6 (RHEL6) have deprecated inittab; instead, init.ohasd is configured via upstart in /etc/init/oracle-ohasd.conf, however the process "/etc/init.d/init.ohasd run" should still be up.

If any rc Snn script (located in rcN.d, for example S98gcstartup) is stuck, the init process may not start "/etc/init.d/init.ohasd run"; please engage the OS vendor to find out why the relevant Snn script is stuck.

Error "[ohasd(<pid>)] CRS-0715:Oracle High Availability Service has timed out waiting for init.ohasd to be started." may be reported of init.ohasd fails to start on time.
If SA can not identify the reason why init.ohasd is not starting, the following can be a very short term workaround:
cd <location-of-init.ohasd>
nohup ./init.ohasd run &



3. Clusterware auto start is enabled - it is enabled by default

By default CRS is enabled for auto start upon node reboot, to enable:
$GRID_HOME/bin/crsctl enable crs


To verify whether it's currently enabled or not:
$GRID_HOME/bin/crsctl config crs


If the following is in OS messages file
Feb 29 16:20:36 racnode1 logger: Oracle Cluster Ready Services startup disabled.
Feb 29 16:20:36 racnode1 logger: Could not access /var/opt/oracle/scls_scr/racnode1/root/ohasdstr


The reason is that the file does not exist or is not accessible; the cause can be that someone modified it manually, or that the wrong opatch was used to apply a GI patch (i.e. opatch for Solaris X64 used to apply a patch on Linux).




4. syslogd is up and OS is able to execute init script S96ohasd

The OS may be stuck on some other Snn script while the node is coming up, and thus never get the chance to execute S96ohasd; if that is the case, the following message will not be in OS messages:
Jan 20 20:46:51 rac1 logger: Oracle HA daemon is enabled for autostart.


If you don't see the above message, the other possibility is that syslogd (/usr/sbin/syslogd) is not fully up. Grid may fail to come up in that case as well. This may not apply to AIX.

To find out whether the OS is able to execute S96ohasd while the node is coming up, modify S96ohasd:

From:
case `$CAT $AUTOSTARTFILE` in
enable*)
$LOGERR "Oracle HA daemon is enabled for autostart."


To:
case `$CAT $AUTOSTARTFILE` in
enable*)
/bin/touch /tmp/ohasd.start."`date`"
$LOGERR "Oracle HA daemon is enabled for autostart."


After a node reboot, if you don't see /tmp/ohasd.start.<timestamp> being created, it means the OS is stuck on some other Snn script. If you do see /tmp/ohasd.start.<timestamp> but not "Oracle HA daemon is enabled for autostart" in messages, likely syslogd is not fully up. In both cases, you will need to engage the System Administrator to find out the issue at the OS level. For the latter case, the workaround is to "sleep" for about 2 minutes; modify ohasd:

From:
case `$CAT $AUTOSTARTFILE` in
enable*)
$LOGERR "Oracle HA daemon is enabled for autostart."


To:
case `$CAT $AUTOSTARTFILE` in
enable*)
/bin/sleep 120
$LOGERR "Oracle HA daemon is enabled for autostart."


5. The file system on which GRID_HOME resides is online when the init script S96ohasd is executed; once S96ohasd is executed, the following messages should be in the OS messages file:
Jan 20 20:46:51 rac1 logger: Oracle HA daemon is enabled for autostart.
..
Jan 20 20:46:57 rac1 logger: exec /ocw/grid/perl/bin/perl -I/ocw/grid/perl/lib /ocw/grid/bin/crswrapexece.pl /ocw/grid/crs/install/s_crsconfig_rac1_env.txt /ocw/grid/bin/ohasd.bin "reboot"


If you see the first line but not the last line, likely the filesystem containing the GRID_HOME was not online when S96ohasd was executed.


6. Oracle Local Registry (OLR, $GRID_HOME/cdata/${HOSTNAME}.olr) is accessible and valid

ls -l $GRID_HOME/cdata/*.olr
-rw------- 1 root oinstall 272756736 Feb 2 18:20 rac1.olr


If the OLR is inaccessible or corrupted, ohasd.log will likely have messages similar to the following:

..
2010-01-24 22:59:10.470: [ default][1373676464] Initializing OLR
2010-01-24 22:59:10.472: [ OCROSD][1373676464]utopen:6m':failed in stat OCR file/disk /ocw/grid/cdata/rac1.olr, errno=2, os err string=No such file or directory
2010-01-24 22:59:10.472: [ OCROSD][1373676464]utopen:7:failed to open any OCR file/disk, errno=2, os err string=No such file or directory
2010-01-24 22:59:10.473: [ OCRRAW][1373676464]proprinit: Could not open raw device
2010-01-24 22:59:10.473: [ OCRAPI][1373676464]a_init:16!: Backend init unsuccessful : [26]
2010-01-24 22:59:10.473: [ CRSOCR][1373676464] OCR context init failure. Error: PROCL-26: Error while accessing the physical storage Operating System error [No such file or directory] [2]
2010-01-24 22:59:10.473: [ default][1373676464] OLR initalization failured, rc=26
2010-01-24 22:59:10.474: [ default][1373676464]Created alert : (:OHAS00106:) : Failed to initialize Oracle Local Registry
2010-01-24 22:59:10.474: [ default][1373676464][PANIC] OHASD exiting; Could not init OLR


OR

..
2010-01-24 23:01:46.275: [ OCROSD][1228334000]utread:3: Problem reading buffer 1907f000 buflen 4096 retval 0 phy_offset 102400 retry 5
2010-01-24 23:01:46.275: [ OCRRAW][1228334000]propriogid:1_1: Failed to read the whole bootblock. Assumes invalid format.
2010-01-24 23:01:46.275: [ OCRRAW][1228334000]proprioini: all disks are not OCR/OLR formatted
2010-01-24 23:01:46.275: [ OCRRAW][1228334000]proprinit: Could not open raw device
2010-01-24 23:01:46.275: [ OCRAPI][1228334000]a_init:16!: Backend init unsuccessful : [26]
2010-01-24 23:01:46.276: [ CRSOCR][1228334000] OCR context init failure. Error: PROCL-26: Error while accessing the physical storage
2010-01-24 23:01:46.276: [ default][1228334000] OLR initalization failured, rc=26
2010-01-24 23:01:46.276: [ default][1228334000]Created alert : (:OHAS00106:) : Failed to initialize Oracle Local Registry
2010-01-24 23:01:46.277: [ default][1228334000][PANIC] OHASD exiting; Could not init OLR


OR

..
2010-11-07 03:00:08.932: [ default][1] Created alert : (:OHAS00102:) : OHASD is not running as privileged user
2010-11-07 03:00:08.932: [ default][1][PANIC] OHASD exiting: must be run as privileged user


OR

ohasd.bin comes up, but the output of "crsctl stat res -t -init" shows no resources, and "ocrconfig -local -manualbackup" fails


OR

..
2010-08-04 13:13:11.102: [ CRSPE][35] Resources parsed
2010-08-04 13:13:11.103: [ CRSPE][35] Server [] has been registered with the PE data model
2010-08-04 13:13:11.103: [ CRSPE][35] STARTUPCMD_REQ = false:
2010-08-04 13:13:11.103: [ CRSPE][35] Server [] has changed state from [Invalid/unitialized] to [VISIBLE]
2010-08-04 13:13:11.103: [ CRSOCR][31] Multi Write Batch processing...
2010-08-04 13:13:11.103: [ default][35] Dump State Starting ...
..
2010-08-04 13:13:11.112: [ CRSPE][35] SERVERS:
:VISIBLE:address{{Absolute|Node:0|Process:-1|Type:1}}; recovered state:VISIBLE. Assigned to no pool

------------- SERVER POOLS:
Free [min:0][max:-1][importance:0] NO SERVERS ASSIGNED

2010-08-04 13:13:11.113: [ CRSPE][35] Dumping ICE contents...:ICE operation count: 0
2010-08-04 13:13:11.113: [ default][35] Dump State Done.


The solution is to restore a good backup of OLR with "ocrconfig -local -restore <ocr_backup_name>".
By default, OLR will be backed up to $GRID_HOME/cdata/$HOST/backup_$TIME_STAMP.olr once installation is complete.
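A sketch of checking the OLR and restoring it from the default backup location, run as root (the hostname racnode1 and the backup file name below are illustrative; pick the real file from the ls output):

# $GRID_HOME/bin/ocrcheck -local
# ls -l $GRID_HOME/cdata/racnode1/backup_*.olr
# $GRID_HOME/bin/crsctl stop crs -f
# $GRID_HOME/bin/ocrconfig -local -restore $GRID_HOME/cdata/racnode1/backup_20100201_120000.olr     <<< file name is illustrative
# $GRID_HOME/bin/crsctl start crs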

7. ohasd.bin is able to access network socket files:

2010-06-29 10:31:01.570: [ COMMCRS][1206901056]clsclisten: Permission denied for (ADDRESS=(PROTOCOL=ipc)(KEY=procr_local_conn_0_PROL))

2010-06-29 10:31:01.571: [ OCRSRV][1217390912]th_listen: CLSCLISTEN failed clsc_ret= 3, addr= [(ADDRESS=(PROTOCOL=ipc)(KEY=procr_local_conn_0_PROL))]
2010-06-29 10:31:01.571: [ OCRSRV][3267002960]th_init: Local listener did not reach valid state


In a Grid Infrastructure cluster environment, ohasd-related socket files should be owned by root, while in an Oracle Restart environment they should be owned by the grid user; refer to the "Network Socket File Location, Ownership and Permission" section for example output.
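A quick check of the ohasd-related socket files (the directory varies by platform; /var/tmp/.oracle is used here as an example, see the socket file section below for the candidate locations):

# ls -l /var/tmp/.oracle/sprocr_local_conn_0_PROL /var/tmp/.oracle/sOHASD*

In a cluster environment these should be owned by root, in Oracle Restart by the grid user, matching the example output in the referenced section.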

8. ohasd.bin is able to access log file location:

OS messages/syslog shows:
Feb 20 10:47:08 racnode1 OHASD[9566]: OHASD exiting; Directory /ocw/grid/log/racnode1/ohasd not found.


Refer to "Log File Location, Ownership and Permission" section for example output, if the expected directory is missing, create it with proper ownership and permission.

9. ohasd may fail to start on SUSE Linux after a node reboot; refer to note 1325718.1 - OHASD not Starting After Reboot on SLES

10. OHASD fails to start: "ps -ef| grep ohasd.bin" shows ohasd.bin is started, but nothing is written to $GRID_HOME/log/<node>/ohasd/ohasd.log for many minutes, and truss shows it is looping, closing file handles that were never opened:

..
15058/1: 0.1995 close(2147483646) Err#9 EBADF
15058/1: 0.1996 close(2147483645) Err#9 EBADF
..


Call stack of ohasd.bin from pstack shows the following:
_close sclssutl_closefiledescriptors main ..


The cause is bug 11834289, which is fixed in 11.2.0.3 and above. Another symptom of the bug is that clusterware processes may fail to start with the same call stack and truss output (looping on the OS call "close"). If the bug happens when trying to start other resources, "CRS-5802: Unable to start the agent process" could show up as well.

11. Other potential causes/solutions listed in note 1069182.1 - OHASD Failed to Start: Inappropriate ioctl for device

12. ohasd.bin started fine, however, "crsctl check crs" shows only the following and nothing else:
CRS-4638: Oracle High Availability Services is online

And "crsctl stat res -p -init" shows nothing

The cause is that the OLR is corrupted; refer to note 1193643.1 to restore it.

13. On EL7/OL7: note 1959008.1 - Install of Clusterware fails while running root.sh on OL7 - ohasd fails to start

14. For EL7/OL7, patch 25606616 is needed: TRACKING BUG TO PROVIDE GI FIXES FOR OL7

15. If ohasd still fails to start, check <grid-home>/log/<nodename>/ohasd/ohasd.log and ohasdOUT.log for details


Case 2: OHASD Agents do not start


OHASD.BIN will spawn four agents/monitors to start resources:

oraagent: responsible for ora.asm, ora.evmd, ora.gipcd, ora.gpnpd, ora.mdnsd etc
orarootagent: responsible for ora.crsd, ora.ctssd, ora.diskmon, ora.drivers.acfs etc
cssdagent / cssdmonitor: responsible for ora.cssd(for ocssd.bin) and ora.cssdmonitor(for cssdmonitor itself)

If ohasd.bin cannot start any of the above agents properly, the clusterware will not reach a healthy state.

1. A common cause of agent failure is that the log file or log directory for the agents doesn't have proper ownership or permission.

Refer to below section "Log File Location, Ownership and Permission" for general reference.

One example is that "rootcrs.pl -patch/-postpatch" wasn't executed during manual patching, resulting in agent start failure:

2015-02-25 15:43:54.350806 : CRSMAIN:3294918400: {0:0:2} {0:0:2} Created alert : (:CRSAGF00123:) : Failed to start the agent process: /ocw/grid/bin/orarootagent Category: -1 Operation: fail Loc: canexec2 OS error: 0 Other : no exe permission, file [/ocw/grid/bin/orarootagent]

2015-02-25 15:43:54.382154 : CRSMAIN:3294918400: {0:0:2} {0:0:2} Created alert : (:CRSAGF00123:) : Failed to start the agent process: /ocw/grid/bin/cssdagent Category: -1 Operation: fail Loc: canexec2 OS error: 0 Other : no exe permission, file [/ocw/grid/bin/cssdagent]

2015-02-25 15:43:54.384105 : CRSMAIN:3294918400: {0:0:2} {0:0:2} Created alert : (:CRSAGF00123:) : Failed to start the agent process: /ocw/grid/bin/cssdmonitor Category: -1 Operation: fail Loc: canexec2 OS error: 0 Other : no exe permission, file [/ocw/grid/bin/cssdmonitor]


The solution is to execute the missed steps; a sketch follows.
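For example, on 11.2/12.1 the missed post-patch step can usually be re-run as root on the affected node (the exact flags depend on the version and on how the patch was applied, so treat this as a sketch only):

# $GRID_HOME/perl/bin/perl $GRID_HOME/crs/install/rootcrs.pl -postpatch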



2. If an agent binary (oraagent.bin, orarootagent.bin, etc.) is corrupted, the agent will not start, resulting in the related resources not coming up:
2011-05-03 11:11:13.189
[ohasd(25303)]CRS-5828:Could not start agent '/ocw/grid/bin/orarootagent_grid'. Details at (:CRSAGF00130:) {0:0:2} in /ocw/grid/log/racnode1/ohasd/ohasd.log.


2011-05-03 12:03:17.491: [ AGFW][1117866336] {0:0:184} Created alert : (:CRSAGF00130:) : Failed to start the agent /ocw/grid/bin/orarootagent_grid
2011-05-03 12:03:17.491: [ AGFW][1117866336] {0:0:184} Agfw Proxy Server sending the last reply to PE for message:RESOURCE_START[ora.diskmon 1 1] ID 4098:403
2011-05-03 12:03:17.491: [ AGFW][1117866336] {0:0:184} Can not stop the agent: /ocw/grid/bin/orarootagent_grid because pid is not initialized
..
2011-05-03 12:03:17.492: [ CRSPE][1128372576] {0:0:184} Fatal Error from AGFW Proxy: Unable to start the agent process
2011-05-03 12:03:17.492: [ CRSPE][1128372576] {0:0:184} CRS-2674: Start of 'ora.diskmon' on 'racnode1' failed

..

2011-06-27 22:34:57.805: [ AGFW][1131669824] {0:0:2} Created alert : (:CRSAGF00123:) : Failed to start the agent process: /ocw/grid/bin/cssdagent Category: -1 Operation: fail Loc: canexec2 OS error: 0 Other : no exe permission, file [/ocw/grid/bin/cssdagent]
2011-06-27 22:34:57.805: [ AGFW][1131669824] {0:0:2} Created alert : (:CRSAGF00126:) : Agent start failed
..
2011-06-27 22:34:57.806: [ AGFW][1131669824] {0:0:2} Created alert : (:CRSAGF00123:) : Failed to start the agent process: /ocw/grid/bin/cssdmonitor Category: -1 Operation: fail Loc: canexec2 OS error: 0 Other : no exe permission, file [/ocw/grid/bin/cssdmonitor]


The solution is to compare the agent binary with the one on a "good" node and restore a good copy (see the sketch after the truss output below).

truss/strace of ohasd shows the agent binary is corrupted:
32555 17:38:15.953355 execve("/ocw/grid/bin/orarootagent.bin",
["/opt/grid/product/112020/grid/bi"...], [/* 38 vars */]) = 0
..
32555 17:38:15.954151 --- SIGBUS (Bus error) @ 0 (0) ---
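A hedged sketch of comparing the binary with a healthy node and restoring it (the node name goodnode and the /ocw/grid path are illustrative; stop the stack on the affected node before copying):

$ md5sum /ocw/grid/bin/orarootagent.bin          <<< compare the checksum with the same file on a good node
$ ls -l /ocw/grid/bin/orarootagent.bin           <<< compare size, ownership and permission as well
# scp goodnode:/ocw/grid/bin/orarootagent.bin /ocw/grid/bin/orarootagent.bin     <<< goodnode is illustrative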

3. Agent may fail to start due to bug 11834289 with error "CRS-5802: Unable to start the agent process", refer to Section "OHASD does not start" #10 for details.

4. Refer to: note 1964240.1 - CRS-5823:Could not initialize agent framework

Case 3: OCSSD.BIN does not start


Successful cssd.bin startup depends on the following:

1. GPnP profile is accessible - gpnpd needs to be fully up to serve profile

If ocssd.bin is able to get the profile successfully, ocssd.log will likely have messages similar to the following:
2010-02-02 18:00:16.251: [ GPnP][408926240]clsgpnpm_exchange: [at clsgpnpm.c:1175] Calling "ipc://GPNPD_rac1", try 4 of 500...
2010-02-02 18:00:16.263: [ GPnP][408926240]clsgpnp_profileVerifyForCall: [at clsgpnp.c:1867] Result: (87) CLSGPNP_SIG_VALPEER. Profile verified. prf=0x165160d0
2010-02-02 18:00:16.263: [ GPnP][408926240]clsgpnp_profileGetSequenceRef: [at clsgpnp.c:841] Result: (0) CLSGPNP_OK. seq of p=0x165160d0 is '6'=6
2010-02-02 18:00:16.263: [ GPnP][408926240]clsgpnp_profileCallUrlInt: [at clsgpnp.c:2186] Result: (0) CLSGPNP_OK. Successful get-profile CALL to remote "ipc://GPNPD_rac1" disco ""


Otherwise, messages like the following will show in ocssd.log:
2010-02-03 22:26:17.057: [ GPnP][3852126240]clsgpnpm_connect: [at clsgpnpm.c:1100] GIPC gipcretConnectionRefused (29) gipcConnect(ipc-ipc://GPNPD_rac1)
2010-02-03 22:26:17.057: [ GPnP][3852126240]clsgpnpm_connect: [at clsgpnpm.c:1101] Result: (48) CLSGPNP_COMM_ERR. Failed to connect to call url "ipc://GPNPD_rac1"
2010-02-03 22:26:17.057: [ GPnP][3852126240]clsgpnp_getProfileEx: [at clsgpnp.c:546] Result: (13) CLSGPNP_NO_DAEMON. Can't get GPnP service profile from local GPnP daemon
2010-02-03 22:26:17.057: [ default][3852126240]Cannot get GPnP profile. Error CLSGPNP_NO_DAEMON (GPNPD daemon is not running).
2010-02-03 22:26:17.057: [ CSSD][3852126240]clsgpnp_getProfile failed, rc(13)

The solution is to ensure gpnpd is up and running properly; a quick check is sketched below.
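A quick way to confirm gpnpd is up and serving the profile, run as the grid user (standard GI commands):

$ ps -ef | grep gpnpd.bin
$ $GRID_HOME/bin/crsctl stat res ora.gpnpd -init
$ $GRID_HOME/bin/gpnptool get        <<< should dump the XML profile if gpnpd is serving it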


2. Voting Disk is accessible

In 11gR2, ocssd.bin discovers voting disks using the setting from the GPnP profile; if not enough voting disks can be identified, ocssd.bin will abort itself.
2010-02-03 22:37:22.212: [ CSSD][2330355744]clssnmReadDiscoveryProfile: voting file discovery string(/share/storage/di*)
..
2010-02-03 22:37:22.227: [ CSSD][1145538880]clssnmvDiskVerify: Successful discovery of 0 disks
2010-02-03 22:37:22.227: [ CSSD][1145538880]clssnmCompleteInitVFDiscovery: Completing initial voting file discovery
2010-02-03 22:37:22.227: [ CSSD][1145538880]clssnmvFindInitialConfigs: No voting files found
2010-02-03 22:37:22.228: [ CSSD][1145538880]###################################
2010-02-03 22:37:22.228: [ CSSD][1145538880]clssscExit: CSSD signal 11 in thread clssnmvDDiscThread


ocssd.bin may fail to come up with the following error if all nodes failed while a voting file change was in progress:
2010-05-02 03:11:19.033: [ CSSD][1197668093]clssnmCompleteInitVFDiscovery: Detected voting file add in progress for CIN 0:1134513465:0, waiting for configuration to complete 0:1134513098:0


The solution is to start ocssd.bin in exclusive mode per note 1364971.1; a sketch of the usual sequence follows.
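The usual sequence from the referenced note is roughly the following, run as root on one node only (a sketch; follow note 1364971.1 for the exact steps for your version):

# $GRID_HOME/bin/crsctl start crs -excl -nocrs     <<< 11.2.0.2 onward; on 11.2.0.1 use "crsctl start crs -excl"
# $GRID_HOME/bin/crsctl query css votedisk
... fix or replace the voting files as needed, then ...
# $GRID_HOME/bin/crsctl stop crs -f
# $GRID_HOME/bin/crsctl start crs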


If the voting disk is located on a non-ASM device, ownership and permissions should be:

-rw-r----- 1 ogrid oinstall 21004288 Feb 4 09:13 votedisk1

3. Network is functional and name resolution is working:

If ocssd.bin can't bind to any network, ocssd.log will likely have messages like the following:
2010-02-03 23:26:25.804: [GIPCXCPT][1206540320]gipcmodGipcPassInitializeNetwork: failed to find any interfaces in clsinet, ret gipcretFail (1)
2010-02-03 23:26:25.804: [GIPCGMOD][1206540320]gipcmodGipcPassInitializeNetwork: EXCEPTION[ ret gipcretFail (1) ] failed to determine host from clsinet, using default
..
2010-02-03 23:26:25.810: [ CSSD][1206540320]clsssclsnrsetup: gipcEndpoint failed, rc 39
2010-02-03 23:26:25.811: [ CSSD][1206540320]clssnmOpenGIPCEndp: failed to listen on gipc addr gipc://rac1:nm_eotcs- ret 39
2010-02-03 23:26:25.811: [ CSSD][1206540320]clssscmain: failed to open gipc endp



If there's a connectivity issue on the private network (including multicast being off), ocssd.log will likely have messages like the following:
2010-09-20 11:52:54.014: [ CSSD][1103055168]clssnmvDHBValidateNCopy: node 1, racnode1, has a disk HB, but no network HB, DHB has rcfg 180441784, wrtcnt, 453, LATS 328297844, lastSeqNo 452, uniqueness 1284979488, timestamp 1284979973/329344894
2010-09-20 11:52:54.016: [ CSSD][1078421824]clssgmWaitOnEventValue: after CmInfo State val 3, eval 1 waited 0
.. >>>> after a long delay
2010-09-20 12:02:39.578: [ CSSD][1103055168]clssnmvDHBValidateNCopy: node 1, racnode1, has a disk HB, but no network HB, DHB has rcfg 180441784, wrtcnt, 1037, LATS 328883434, lastSeqNo 1036, uniqueness 1284979488, timestamp 1284980558/329930254
2010-09-20 12:02:39.895: [ CSSD][1107286336]clssgmExecuteClientRequest: MAINT recvd from proc 2 (0xe1ad870)
2010-09-20 12:02:39.895: [ CSSD][1107286336]clssgmShutDown: Received abortive shutdown request from client.
2010-09-20 12:02:39.895: [ CSSD][1107286336]###################################
2010-09-20 12:02:39.895: [ CSSD][1107286336]clssscExit: CSSD aborting from thread GMClientListener
2010-09-20 12:02:39.895: [ CSSD][1107286336]###################################


To validate the network, please refer to note 1054902.1
If CSSD could not start after a network change, also check that the network interface name matches the gpnp profile definition ("gpnptool get") for cluster_interconnect.

In 11.2.0.1, ocssd.bin may bind to the public network if the private network is unavailable.
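A couple of quick interconnect checks, run as the grid user (interface names and remote addresses are environment-specific):

$ $GRID_HOME/bin/oifcfg getif                                      <<< shows which interface is registered as cluster_interconnect
$ $GRID_HOME/bin/gpnptool get 2>/dev/null | grep -i network        <<< interface name in the GPnP profile should match the OS interface
$ ping -c 3 <private-ip-of-remote-node>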

4. Vendor clusterware is up (if using vendor clusterware)

Grid Infrastructure provides full clusterware functionality and doesn't need vendor clusterware to be installed; but if you happen to run Grid Infrastructure on top of vendor clusterware, then the vendor clusterware needs to come up fully before CRS can be started. To verify, as the grid user:
$GRID_HOME/bin/lsnodes -n
racnode1 1
racnode2 0


If the vendor clusterware is not fully up, ocssd.log will likely have messages similar to the following:
2010-08-30 18:28:13.207: [ CSSD][36]clssnm_skgxninit: skgxncin failed, will retry
2010-08-30 18:28:14.207: [ CSSD][36]clssnm_skgxnmon: skgxn init failed
2010-08-30 18:28:14.208: [ CSSD][36]###################################
2010-08-30 18:28:14.208: [ CSSD][36]clssscExit: CSSD signal 11 in thread skgxnmon


Before the clusterware is installed, execute the command below as grid user:
$INSTALL_SOURCE/install/lsnodes -v


One issue on hp-ux: note 2130230.1 - Grid infrastructure startup fails due to vendor Clusterware did not start (HP-UX Service guard)


5. Command "crsctl" being executed from wrong GRID_HOME

Command "crsctl" must be executed from correct GRID_HOME to start the stack, or similar message will be reported:
2012-11-14 10:21:44.014: [ CSSD][1086675264]ASSERT clssnm1.c 3248
2012-11-14 10:21:44.014: [ CSSD][1086675264](:CSSNM00056:)clssnmvStartDiscovery: Terminating because of the release version(11.2.0.2.0) of this node being lesser than the active version(11.2.0.3.0) that the cluster is at
2012-11-14 10:21:44.014: [ CSSD][1086675264]###################################
2012-11-14 10:21:44.014: [ CSSD][1086675264]clssscExit: CSSD aborting from thread clssnmvDDiscThread#
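One way to confirm which Grid home the node is actually configured with, so crsctl is run from the right place (Linux path assumed; on some platforms the file is /var/opt/oracle/olr.loc):

# cat /etc/oracle/olr.loc
olrconfig_loc=/ocw/grid/cdata/rac1.olr
crs_home=/ocw/grid

crsctl should then be invoked from <crs_home>/bin.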

Case 4: CRSD.BIN does not start

If the "crsctl stat res -t -init" shows that ora.crsd is in intermediate state and if this is not the first node where crsd is starting, then a likely cause is that the csrd.bin is not able to talk to the master crsd.bin.
In this case, the master crsd.bin is likely having a problem, so killing the master crsd.bin is a likely solution.
Issue "grep MASTER crsd.trc" to find out the node where the master crsd.bin is running. Kill the crsd.bin on that master node.
The crsd.bin will automatically respawn although the master will be transferred to crsd.bin on another node.


Successful crsd.bin startup depends on the following:

1. ocssd is fully up

If ocssd.bin is not fully up, crsd.log will show messages like the following:
2010-02-03 22:37:51.638: [ CSSCLNT][1548456880]clssscConnect: gipc request failed with 29 (0x16)
2010-02-03 22:37:51.638: [ CSSCLNT][1548456880]clsssInitNative: connect failed, rc 29
2010-02-03 22:37:51.639: [ CRSRTI][1548456880] CSS is not ready. Received status 3 from CSS. Waiting for good status ..



2. OCR is accessible

If the OCR is located on ASM, the ora.asm resource (ASM instance) must be up and the diskgroup holding the OCR must be mounted; if not, crsd.log will likely show messages like:
2010-02-03 22:22:55.186: [ OCRASM][2603807664]proprasmo: Error in open/create file in dg [GI]
[ OCRASM][2603807664]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=15077, loc=kgfokge
ORA-15077: could not locate ASM instance serving a required diskgroup

2010-02-03 22:22:55.189: [ OCRASM][2603807664]proprasmo: kgfoCheckMount returned [7]
2010-02-03 22:22:55.189: [ OCRASM][2603807664]proprasmo: The ASM instance is down
2010-02-03 22:22:55.190: [ OCRRAW][2603807664]proprioo: Failed to open [+GI]. Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2010-02-03 22:22:55.190: [ OCRRAW][2603807664]proprioo: No OCR/OLR devices are usable
2010-02-03 22:22:55.190: [ OCRASM][2603807664]proprasmcl: asmhandle is NULL
2010-02-03 22:22:55.190: [ OCRRAW][2603807664]proprinit: Could not open raw device
2010-02-03 22:22:55.190: [ OCRASM][2603807664]proprasmcl: asmhandle is NULL
2010-02-03 22:22:55.190: [ OCRAPI][2603807664]a_init:16!: Backend init unsuccessful : [26]
2010-02-03 22:22:55.190: [ CRSOCR][2603807664] OCR context init failure. Error: PROC-26: Error while accessing the physical storage ASM error [SLOS: cat=7, opn=kgfoAl06, dep=15077, loc=kgfokge
ORA-15077: could not locate ASM instance serving a required diskgroup
] [7]
2010-02-03 22:22:55.190: [ CRSD][2603807664][PANIC] CRSD exiting: Could not init OCR, code: 26


Note: in 11.2 ASM starts before crsd.bin, and brings up the diskgroup automatically if it contains the OCR.
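A quick way to confirm the ASM instance and the OCR diskgroup are available (the first two commands as the grid user with the Grid environment set, ocrcheck as root):

$ $GRID_HOME/bin/crsctl stat res ora.asm -init
$ $GRID_HOME/bin/asmcmd lsdg            <<< the diskgroup holding the OCR should be listed as MOUNTED
# $GRID_HOME/bin/ocrcheck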

If the OCR is located on a non-ASM device, expected ownership and permissions are:

-rw-r----- 1 root oinstall 272756736 Feb 3 23:24 ocr

If the OCR is located on a non-ASM device and it's unavailable, crsd.log will likely show messages similar to the following:
2010-02-03 23:14:33.583: [ OCROSD][2346668976]utopen:7:failed to open any OCR file/disk, errno=2, os err string=No such file or directory
2010-02-03 23:14:33.583: [ OCRRAW][2346668976]proprinit: Could not open raw device
2010-02-03 23:14:33.583: [ default][2346668976]a_init:7!: Backend init unsuccessful : [26]
2010-02-03 23:14:34.587: [ OCROSD][2346668976]utopen:6m':failed in stat OCR file/disk /share/storage/ocr, errno=2, os err string=No such file or directory
2010-02-03 23:14:34.587: [ OCROSD][2346668976]utopen:7:failed to open any OCR file/disk, errno=2, os err string=No such file or directory
2010-02-03 23:14:34.587: [ OCRRAW][2346668976]proprinit: Could not open raw device
2010-02-03 23:14:34.587: [ default][2346668976]a_init:7!: Backend init unsuccessful : [26]
2010-02-03 23:14:35.589: [ CRSD][2346668976][PANIC] CRSD exiting: OCR device cannot be initialized, error: 1:26



If the OCR is corrupted, likely crsd.log will show messages like the following:
2010-02-03 23:19:38.417: [ default][3360863152]a_init:7!: Backend init unsuccessful : [26]
2010-02-03 23:19:39.429: [ OCRRAW][3360863152]propriogid:1_2: INVALID FORMAT
2010-02-03 23:19:39.429: [ OCRRAW][3360863152]proprioini: all disks are not OCR/OLR formatted
2010-02-03 23:19:39.429: [ OCRRAW][3360863152]proprinit: Could not open raw device
2010-02-03 23:19:39.429: [ default][3360863152]a_init:7!: Backend init unsuccessful : [26]
2010-02-03 23:19:40.432: [ CRSD][3360863152][PANIC] CRSD exiting: OCR device cannot be initialized, error: 1:26



If the owner or group of the grid user was changed, then even if ASM is available, crsd.log will likely show the following:
2010-03-10 11:45:12.510: [ OCRASM][611467760]proprasmo: Error in open/create file in dg [SYSTEMDG]
[ OCRASM][611467760]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=1031, loc=kgfokge
ORA-01031: insufficient privileges

2010-03-10 11:45:12.528: [ OCRASM][611467760]proprasmo: kgfoCheckMount returned [7]
2010-03-10 11:45:12.529: [ OCRASM][611467760]proprasmo: The ASM instance is down
2010-03-10 11:45:12.529: [ OCRRAW][611467760]proprioo: Failed to open [+SYSTEMDG]. Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2010-03-10 11:45:12.529: [ OCRRAW][611467760]proprioo: No OCR/OLR devices are usable
2010-03-10 11:45:12.529: [ OCRASM][611467760]proprasmcl: asmhandle is NULL
2010-03-10 11:45:12.529: [ OCRRAW][611467760]proprinit: Could not open raw device
2010-03-10 11:45:12.529: [ OCRASM][611467760]proprasmcl: asmhandle is NULL
2010-03-10 11:45:12.529: [ OCRAPI][611467760]a_init:16!: Backend init unsuccessful : [26]
2010-03-10 11:45:12.530: [ CRSOCR][611467760] OCR context init failure. Error: PROC-26: Error while accessing the physical storage ASM error [SLOS: cat=7, opn=kgfoAl06, dep=1031, loc=kgfokge
ORA-01031: insufficient privileges
] [7]



If the oracle binary in GRID_HOME has wrong ownership or permission (regardless of whether ASM is up and running), or if the grid user cannot write to ORACLE_BASE, crsd.log will likely show the following:
2012-03-04 21:34:23.139: [ OCRASM][3301265904]proprasmo: Error in open/create file in dg [OCR]
[ OCRASM][3301265904]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=12547, loc=kgfokge

2012-03-04 21:34:23.139: [ OCRASM][3301265904]ASM Error Stack : ORA-12547: TNS:lost contact

2012-03-04 21:34:23.633: [ OCRASM][3301265904]proprasmo: kgfoCheckMount returned [7]
2012-03-04 21:34:23.633: [ OCRASM][3301265904]proprasmo: The ASM instance is down
2012-03-04 21:34:23.634: [ OCRRAW][3301265904]proprioo: Failed to open [+OCR]. Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2012-03-04 21:34:23.634: [ OCRRAW][3301265904]proprioo: No OCR/OLR devices are usable
2012-03-04 21:34:23.635: [ OCRASM][3301265904]proprasmcl: asmhandle is NULL
2012-03-04 21:34:23.636: [ GIPC][3301265904] gipcCheckInitialization: possible incompatible non-threaded init from [prom.c : 690], original from [clsss.c : 5326]
2012-03-04 21:34:23.639: [ default][3301265904]clsvactversion:4: Retrieving Active Version from local storage.
2012-03-04 21:34:23.643: [ OCRRAW][3301265904]proprrepauto: The local OCR configuration matches with the configuration published by OCR Cache Writer. No repair required.
2012-03-04 21:34:23.645: [ OCRRAW][3301265904]proprinit: Could not open raw device
2012-03-04 21:34:23.646: [ OCRASM][3301265904]proprasmcl: asmhandle is NULL
2012-03-04 21:34:23.650: [ OCRAPI][3301265904]a_init:16!: Backend init unsuccessful : [26]
2012-03-04 21:34:23.651: [ CRSOCR][3301265904] OCR context init failure. Error: PROC-26: Error while accessing the physical storage
ORA-12547: TNS:lost contact

2012-03-04 21:34:23.652: [ CRSMAIN][3301265904] Created alert : (:CRSD00111:) : Could not init OCR, error: PROC-26: Error while accessing the physical storage
ORA-12547: TNS:lost contact

2012-03-04 21:34:23.652: [ CRSD][3301265904][PANIC] CRSD exiting: Could not init OCR, code: 26


The expected ownership and permission of the oracle binary in GRID_HOME should be:
-rwsr-s--x 1 grid oinstall 184431149 Feb 2 20:37 /ocw/grid/bin/oracle
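If the ownership or permission has drifted, it can usually be corrected as root (the owner grid, group oinstall and path are taken from the example line above; -rwsr-s--x corresponds to mode 6751):

# chown grid:oinstall /ocw/grid/bin/oracle
# chmod 6751 /ocw/grid/bin/oracle
# ls -l /ocw/grid/bin/oracle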


If the OCR or its mirror is unavailable (e.g. ASM is up, but the diskgroup for the OCR/mirror is unmounted), crsd.log will likely show the following:
2010-05-11 11:16:38.578: [ OCRASM][18]proprasmo: Error in open/create file in dg [OCRMIR]
[ OCRASM][18]SLOS : SLOS: cat=8, opn=kgfoOpenFile01, dep=15056, loc=kgfokge
ORA-17503: ksfdopn:DGOpenFile05 Failed to open file +OCRMIR.255.4294967295
ORA-17503: ksfdopn:2 Failed to open file +OCRMIR.255.4294967295
ORA-15001: diskgroup "OCRMIR
..
2010-05-11 11:16:38.647: [ OCRASM][18]proprasmo: kgfoCheckMount returned [6]
2010-05-11 11:16:38.648: [ OCRASM][18]proprasmo: The ASM disk group OCRMIR is not found or not mounted
2010-05-11 11:16:38.648: [ OCRASM][18]proprasmdvch: Failed to open OCR location [+OCRMIR] error [26]
2010-05-11 11:16:38.648: [ OCRRAW][18]propriodvch: Error [8] returned device check for [+OCRMIR]
2010-05-11 11:16:38.648: [ OCRRAW][18]dev_replace: non-master could not verify the new disk (8)
[ OCRSRV][18]proath_invalidate_action: Failed to replace [+OCRMIR] [8]
[ OCRAPI][18]procr_ctx_set_invalid_no_abort: ctx set to invalid
..
2010-05-11 11:16:46.587: [ OCRMAS][19]th_master:91: Comparing device hash ids between local and master failed
2010-05-11 11:16:46.587: [ OCRMAS][19]th_master:91 Local dev (1862408427, 1028247821, 0, 0, 0)
2010-05-11 11:16:46.587: [ OCRMAS][19]th_master:91 Master dev (1862408427, 1859478705, 0, 0, 0)
2010-05-11 11:16:46.587: [ OCRMAS][19]th_master:9: Shutdown CacheLocal. my hash ids don't match
[ OCRAPI][19]procr_ctx_set_invalid_no_abort: ctx set to invalid
[ OCRAPI][19]procr_ctx_set_invalid: aborting...
2010-05-11 11:16:46.587: [ CRSD][19] Dump State Starting ...



3. The crsd.bin pid file exists and points to a running crsd.bin process

If the pid file does not exist, $GRID_HOME/log/$HOST/agent/ohasd/orarootagent_root/orarootagent_root.log will have messages similar to the following:

2010-02-14 17:40:57.927: [ora.crsd][1243486528] [check] PID FILE doesn't exist.
..
2010-02-14 17:41:57.927: [ clsdmt][1092499776]Creating PID [30269] file for home /ocw/grid host racnode1 bin crs to /ocw/grid/crs/init/
2010-02-14 17:41:57.927: [ clsdmt][1092499776]Error3 -2 writing PID [30269] to the file []
2010-02-14 17:41:57.927: [ clsdmt][1092499776]Failed to record pid for CRSD
2010-02-14 17:41:57.927: [ clsdmt][1092499776]Terminating process
2010-02-14 17:41:57.927: [ default][1092499776] CRSD exiting on stop request from clsdms_thdmai


The solution is to create a dummy pid file ($GRID_HOME/crs/init/$HOST.pid) manually as the grid user with the "touch" command and restart the resource ora.crsd.

If the pid file does exist and the PID in it references a running process which is NOT the crsd.bin process, $GRID_HOME/log/$HOST/agent/ohasd/orarootagent_root/orarootagent_root.log will have messages similar to the following:
2011-04-06 15:53:38.777: [ora.crsd][1160390976] [check] PID will be looked for in /ocw/grid/crs/init/racnode1.pid
2011-04-06 15:53:38.778: [ora.crsd][1160390976] [check] PID which will be monitored will be 1535 >> 1535 is output of "cat /ocw/grid/crs/init/racnode1.pid"
2011-04-06 15:53:38.965: [ COMMCRS][1191860544]clsc_connect: (0x2aaab400b0b0) no listener at (ADDRESS=(PROTOCOL=ipc)(KEY=racnode1DBG_CRSD))
[ clsdmc][1160390976]Fail to connect (ADDRESS=(PROTOCOL=ipc)(KEY=racnode1DBG_CRSD)) with status 9
2011-04-06 15:53:38.966: [ora.crsd][1160390976] [check] Error = error 9 encountered when connecting to CRSD
2011-04-06 15:53:39.023: [ora.crsd][1160390976] [check] Calling PID check for daemon
2011-04-06 15:53:39.023: [ora.crsd][1160390976] [check] Trying to check PID = 1535
2011-04-06 15:53:39.203: [ora.crsd][1160390976] [check] PID check returned ONLINE CLSDM returned OFFLINE
2011-04-06 15:53:39.203: [ora.crsd][1160390976] [check] DaemonAgent::check returned 5
2011-04-06 15:53:39.203: [ AGFW][1160390976] check for resource: ora.crsd 1 1 completed with status: FAILED
2011-04-06 15:53:39.203: [ AGFW][1170880832] ora.crsd 1 1 state changed from: UNKNOWN to: FAILED
..
2011-04-06 15:54:10.511: [ AGFW][1167522112] ora.crsd 1 1 state changed from: UNKNOWN to: CLEANING
..
2011-04-06 15:54:10.513: [ora.crsd][1146542400] [clean] Trying to stop PID = 1535
..
2011-04-06 15:54:11.514: [ora.crsd][1146542400] [clean] Trying to check PID = 1535


To verify on OS level:
ls -l /ocw/grid/crs/init/*pid
-rwxr-xr-x 1 ogrid oinstall 5 Feb 17 11:00 /ocw/grid/crs/init/racnode1.pid
cat /ocw/grid/crs/init/*pid
1535
ps -ef| grep 1535
root 1535 1 0 Mar30 ? 00:00:00 iscsid >> Note process 1535 is not crsd.bin


The solution is to create an empty pid file and to restart the resource ora.crsd, as root:

# > $GRID_HOME/crs/init/<racnode1>.pid
# $GRID_HOME/bin/crsctl stop res ora.crsd -init
# $GRID_HOME/bin/crsctl start res ora.crsd -init



4. Network is functional and name resolution is working:

If the network is not fully functioning, ocssd.bin may still come up, but crsd.bin may fail and the crsd.log will show messages like:

2010-02-03 23:34:28.412: [ GPnP][2235814832]clsgpnp_Init: [at clsgpnp0.c:837] GPnP client pid=867, tl=3, f=0
2010-02-03 23:34:28.428: [ OCRAPI][2235814832]clsu_get_private_ip_addresses: no ip addresses found.
..
2010-02-03 23:34:28.434: [ OCRAPI][2235814832]a_init:13!: Clusterware init unsuccessful : [44]
2010-02-03 23:34:28.434: [ CRSOCR][2235814832] OCR context init failure. Error: PROC-44: Error in network address and interface operations Network address and interface operations error [7]
2010-02-03 23:34:28.434: [ CRSD][2235814832][PANIC] CRSD exiting: Could not init OCR, code: 44


Or:

2009-12-10 06:28:31.974: [ OCRMAS][20]proath_connect_master:1: could not connect to master clsc_ret1 = 9, clsc_ret2 = 9
2009-12-10 06:28:31.974: [ OCRMAS][20]th_master:11: Could not connect to the new master
2009-12-10 06:29:01.450: [ CRSMAIN][2] Policy Engine is not initialized yet!
2009-12-10 06:29:31.489: [ CRSMAIN][2] Policy Engine is not initialized yet!


Or:

2009-12-31 00:42:08.110: [ COMMCRS][10]clsc_receive: (102b03250) Error receiving, ns (12535, 12560), transport (505, 145, 0)


To validate the network, please refer to note 1054902.1

5. The crsd executables (crsd.bin and crsd in GRID_HOME/bin) have correct ownership/permission and haven't been manually modified; a simple way to check is to compare the output of "ls -l <grid-home>/bin/crsd <grid-home>/bin/crsd.bin" with a "good" node.

6. crsd may not start due to the following:

note 1552472.1 - CRSD Will Not Start Following a Node Reboot: crsd.log reports: clsclisten: op 65 failed and/or Unable to get E2E port
note 1684332.1 - GI crsd Fails to Start: clsclisten: op 65 failed, NSerr (12560, 0), transport: (583, 0, 0)


7. To troubleshoot further, refer to note 1323698.1 - Troubleshooting CRSD Start up Issue

Case 5: GPNPD.BIN does not start

1. Name Resolution is not working

gpnpd.bin fails with the following error in gpnpd.log:

2010-05-13 12:48:11.540: [ GPnP][1171126592]clsgpnpm_exchange: [at clsgpnpm.c:1175] Calling "tcp://node2:9393", try 1 of 3...
2010-05-13 12:48:11.540: [ GPnP][1171126592]clsgpnpm_connect: [at clsgpnpm.c:1015] ENTRY
2010-05-13 12:48:11.541: [ GPnP][1171126592]clsgpnpm_connect: [at clsgpnpm.c:1066] GIPC gipcretFail (1) gipcConnect(tcp-tcp://node2:9393)
2010-05-13 12:48:11.541: [ GPnP][1171126592]clsgpnpm_connect: [at clsgpnpm.c:1067] Result: (48) CLSGPNP_COMM_ERR. Failed to connect to call url "tcp://node2:9393"


In the above example, make sure the current node is able to ping "node2" and that there is no firewall between them.

2. Bug 10105195

Due to Bug 10105195, gpnp dispatch is single-threaded and could be blocked by network scanning, etc. The bug is fixed in 11.2.0.2 GI PSU2, 11.2.0.3 and above; refer to note 10105195.8 for more details.
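On releases where the fix is delivered via patch, the installed inventory can be checked as the grid user to confirm whether the fix (or a PSU containing it) is already in place:

$ $GRID_HOME/OPatch/opatch lsinventory | grep -i 10105195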

Case 6: Various other daemons do not start

Common causes:

1. Log file or directory for the daemon doesn't have appropriate ownership or permission

If the log file or log directory for the daemon doesn't have proper ownership or permissions, usually there is no new info in the log file and the timestamp remains the same while the daemon tries to come up.

Refer to below section "Log File Location, Ownership and Permission" for general reference.


2. Network socket file doesn't have appropriate ownership or permission

In this case, the daemon log will show messages like:
2010-02-02 12:55:20.485: [ COMMCRS][1121433920]clsclisten: Permission denied for (ADDRESS=(PROTOCOL=ipc)(KEY=rac1DBG_GIPCD))

2010-02-02 12:55:20.485: [ clsdmt][1110944064]Fail to listen to (ADDRESS=(PROTOCOL=ipc)(KEY=rac1DBG_GIPCD))



3. OLR is corrupted

In this case, the daemon log will show messages like the following (this example is a case where ora.ctssd fails to start):
2012-07-22 00:15:16.565: [ default][1]clsvactversion:4: Retrieving Active Version from local storage.
2012-07-22 00:15:16.575: [ CTSS][1]clsctss_r_av3: Invalid active version [] retrieved from OLR. Returns [19].
2012-07-22 00:15:16.585: [ CTSS][1](:ctss_init16:): Error [19] retrieving active version. Returns [19].
2012-07-22 00:15:16.585: [ CTSS][1]ctss_main: CTSS init failed [19]
2012-07-22 00:15:16.585: [ CTSS][1]ctss_main: CTSS daemon aborting [19].
2012-07-22 00:15:16.585: [ CTSS][1]CTSS daemon aborting


The solution is to restore a good copy of the OLR per note 1193643.1.


4. Other cases:

note 1087521.1 - CTSS Daemon Aborting With "op 65 failed, NSerr (12560, 0), transport: (583, 0, 0)"

Case 7: CRSD Agents do not start


CRSD.BIN will spawn two agents to start up user resources; the two agents share the same names and binaries as the ohasd.bin agents:

orarootagent: responsible for ora.netn.network, ora.nodename.vip, ora.scann.vip and ora.gns
oraagent: responsible for ora.asm, ora.eons, ora.ons, listener, SCAN listener, diskgroup, database, service resource etc

To find out the user resource status:
$GRID_HOME/bin/crsctl stat res -t



If crsd.bin cannot start any of the above agents properly, user resources may not come up.

1. A common cause of agent failure is that the log file or log directory for the agents doesn't have proper ownership or permissions.

Refer to below section "Log File Location, Ownership and Permission" for general reference.

2. Agent may fail to start due to bug 11834289 with error "CRS-5802: Unable to start the agent process", refer to Section "OHASD does not start" #10 for details.

Case 8: HAIP does not start

HAIP may fail to start with various errors, e.g.:
[ohasd(891)]CRS-2807:Resource 'ora.cluster_interconnect.haip' failed to start automatically.

Refer to note 1210883.1 for more details on HAIP.
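To see the current state of the HAIP resource, run as root or the grid user:

$ $GRID_HOME/bin/crsctl stat res ora.cluster_interconnect.haip -init
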
Network and Naming Resolution Verification


CRS depends on a fully functional network and name resolution. If the network or name resolution is not fully functioning, CRS may not come up successfully.

To validate network and name resolution setup, please refer to note 1054902.1

Log File Location, Ownership and Permission


Appropriate ownership and permissions of sub-directories and files in $GRID_HOME/log are critical for CRS components to come up properly.
In Grid Infrastructure cluster environment:

Assuming a Grid Infrastructure environment with node name rac1, CRS owner grid, and two separate RDBMS owners rdbmsap and rdbmsar, here's what it looks like under $GRID_HOME/log in a cluster environment:

drwxrwxr-x 5 grid oinstall 4096 Dec 6 09:20 log
drwxr-xr-x 2 grid oinstall 4096 Dec 6 08:36 crs
drwxr-xr-t 17 root oinstall 4096 Dec 6 09:22 rac1
drwxr-x--- 2 grid oinstall 4096 Dec 6 09:20 admin
drwxrwxr-t 4 root oinstall 4096 Dec 6 09:20 agent
drwxrwxrwt 7 root oinstall 4096 Jan 26 18:15 crsd
drwxr-xr-t 2 grid oinstall 4096 Dec 6 09:40 application_grid
drwxr-xr-t 2 grid oinstall 4096 Jan 26 18:15 oraagent_grid
drwxr-xr-t 2 rdbmsap oinstall 4096 Jan 26 18:15 oraagent_rdbmsap
drwxr-xr-t 2 rdbmsar oinstall 4096 Jan 26 18:15 oraagent_rdbmsar
drwxr-xr-t 2 grid oinstall 4096 Jan 26 18:15 ora_oc4j_type_grid
drwxr-xr-t 2 root root 4096 Jan 26 20:09 orarootagent_root
drwxrwxr-t 6 root oinstall 4096 Dec 6 09:24 ohasd
drwxr-xr-t 2 grid oinstall 4096 Jan 26 18:14 oraagent_grid
drwxr-xr-t 2 root root 4096 Dec 6 09:24 oracssdagent_root
drwxr-xr-t 2 root root 4096 Dec 6 09:24 oracssdmonitor_root
drwxr-xr-t 2 root root 4096 Jan 26 18:14 orarootagent_root
-rw-rw-r-- 1 root root 12931 Jan 26 21:30 alertrac1.log
drwxr-x--- 2 grid oinstall 4096 Jan 26 20:44 client
drwxr-x--- 2 root oinstall 4096 Dec 6 09:24 crsd
drwxr-x--- 2 grid oinstall 4096 Dec 6 09:24 cssd
drwxr-x--- 2 root oinstall 4096 Dec 6 09:24 ctssd
drwxr-x--- 2 grid oinstall 4096 Jan 26 18:14 diskmon
drwxr-x--- 2 grid oinstall 4096 Dec 6 09:25 evmd
drwxr-x--- 2 grid oinstall 4096 Jan 26 21:20 gipcd
drwxr-x--- 2 root oinstall 4096 Dec 6 09:20 gnsd
drwxr-x--- 2 grid oinstall 4096 Jan 26 20:58 gpnpd
drwxr-x--- 2 grid oinstall 4096 Jan 26 21:19 mdnsd
drwxr-x--- 2 root oinstall 4096 Jan 26 21:20 ohasd
drwxrwxr-t 5 grid oinstall 4096 Dec 6 09:34 racg
drwxrwxrwt 2 grid oinstall 4096 Dec 6 09:20 racgeut
drwxrwxrwt 2 grid oinstall 4096 Dec 6 09:20 racgevtf
drwxrwxrwt 2 grid oinstall 4096 Dec 6 09:20 racgmain
drwxr-x--- 2 grid oinstall 4096 Jan 26 20:57 srvm

Please note that most log files in a sub-directory inherit the ownership of the parent directory; the above is just a general reference to tell whether there are unexpected recursive ownership and permission changes inside the CRS home. If you have a working node with the same version, the working node should be used as a reference.

In Oracle Restart environment:

And here's what it looks like under $GRID_HOME/log in an Oracle Restart environment:

drwxrwxr-x 5 grid oinstall 4096 Oct 31 2009 log
drwxr-xr-x 2 grid oinstall 4096 Oct 31 2009 crs
drwxr-xr-x 3 grid oinstall 4096 Oct 31 2009 diag
drwxr-xr-t 17 root oinstall 4096 Oct 31 2009 rac1
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 admin
drwxrwxr-t 4 root oinstall 4096 Oct 31 2009 agent
drwxrwxrwt 2 root oinstall 4096 Oct 31 2009 crsd
drwxrwxr-t 8 root oinstall 4096 Jul 14 08:15 ohasd
drwxr-xr-x 2 grid oinstall 4096 Aug 5 13:40 oraagent_grid
drwxr-xr-x 2 grid oinstall 4096 Aug 2 07:11 oracssdagent_grid
drwxr-xr-x 2 grid oinstall 4096 Aug 3 21:13 orarootagent_grid
-rwxr-xr-x 1 grid oinstall 13782 Aug 1 17:23 alertrac1.log
drwxr-x--- 2 grid oinstall 4096 Nov 2 2009 client
drwxr-x--- 2 root oinstall 4096 Oct 31 2009 crsd
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 cssd
drwxr-x--- 2 root oinstall 4096 Oct 31 2009 ctssd
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 diskmon
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 evmd
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 gipcd
drwxr-x--- 2 root oinstall 4096 Oct 31 2009 gnsd
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 gpnpd
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 mdnsd
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 ohasd
drwxrwxr-t 5 grid oinstall 4096 Oct 31 2009 racg
drwxrwxrwt 2 grid oinstall 4096 Oct 31 2009 racgeut
drwxrwxrwt 2 grid oinstall 4096 Oct 31 2009 racgevtf
drwxrwxrwt 2 grid oinstall 4096 Oct 31 2009 racgmain
drwxr-x--- 2 grid oinstall 4096 Oct 31 2009 srvm


For 12.1.0.2 onward, refer to note 1915729.1 - Oracle Clusterware Diagnostic and Alert Log Moved to ADR

Network Socket File Location, Ownership and Permission


Network socket files can be located in /tmp/.oracle, /var/tmp/.oracle or /usr/tmp/.oracle

When a socket file has unexpected ownership or permission, usually the daemon log file (e.g. evmd.log) will have the following:

2011-06-18 14:07:28.545: [ COMMCRS][772]clsclisten: Permission denied for (ADDRESS=(PROTOCOL=ipc)(KEY=racnode1DBG_EVMD))

2011-06-18 14:07:28.545: [ clsdmt][515]Fail to listen to (ADDRESS=(PROTOCOL=ipc)(KEY=lexxxDBG_EVMD))
2011-06-18 14:07:28.545: [ clsdmt][515]Terminating process
2011-06-18 14:07:28.559: [ default][515] EVMD exiting on stop request from clsdms_thdmai


And the following error may be reported:

CRS-5017: The resource action "ora.evmd start" encountered the following error:
CRS-2674: Start of 'ora.evmd' on 'racnode1' failed
..


The solution is to stop GI as root (crsctl stop crs -f), clean up the socket files, and restart GI; a sketch follows.
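A sketch of that cleanup sequence as root (the socket directory /var/tmp/.oracle is an example; use whichever of the locations above exists on your platform, and clean it only when the stack is fully down on this node):

# $GRID_HOME/bin/crsctl stop crs -f
# rm -f /var/tmp/.oracle/*
# $GRID_HOME/bin/crsctl start crs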


Assuming a Grid Infrastructure environment with node name rac1, CRS owner grid, and cluster name eotcs:
In Grid Infrastructure cluster environment:

Below is an example output from cluster environment:

drwxrwxrwt 2 root oinstall 4096 Feb 2 21:25 .oracle

./.oracle:
drwxrwxrwt 2 root oinstall 4096 Feb 2 21:25 .
srwxrwx--- 1 grid oinstall 0 Feb 2 18:00 master_diskmon
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 mdnsd
-rw-r--r-- 1 grid oinstall 5 Feb 2 18:00 mdnsd.pid
prw-r--r-- 1 root root 0 Feb 2 13:33 npohasd
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 ora_gipc_GPNPD_rac1
-rw-r--r-- 1 grid oinstall 0 Feb 2 13:34 ora_gipc_GPNPD_rac1_lock
srwxrwxrwx 1 grid oinstall 0 Feb 2 13:39 s#11724.1
srwxrwxrwx 1 grid oinstall 0 Feb 2 13:39 s#11724.2
srwxrwxrwx 1 grid oinstall 0 Feb 2 13:39 s#11735.1
srwxrwxrwx 1 grid oinstall 0 Feb 2 13:39 s#11735.2
srwxrwxrwx 1 grid oinstall 0 Feb 2 13:45 s#12339.1
srwxrwxrwx 1 grid oinstall 0 Feb 2 13:45 s#12339.2
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 s#6275.1
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 s#6275.2
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 s#6276.1
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 s#6276.2
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 s#6278.1
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 s#6278.2
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 sAevm
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 sCevm
srwxrwxrwx 1 root root 0 Feb 2 18:01 sCRSD_IPC_SOCKET_11
srwxrwxrwx 1 root root 0 Feb 2 18:01 sCRSD_UI_SOCKET
srwxrwxrwx 1 root root 0 Feb 2 21:25 srac1DBG_CRSD
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 srac1DBG_CSSD
srwxrwxrwx 1 root root 0 Feb 2 18:00 srac1DBG_CTSSD
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 srac1DBG_EVMD
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 srac1DBG_GIPCD
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 srac1DBG_GPNPD
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 srac1DBG_MDNSD
srwxrwxrwx 1 root root 0 Feb 2 18:00 srac1DBG_OHASD
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 sLISTENER
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 sLISTENER_SCAN2
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:01 sLISTENER_SCAN3
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1_
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1_eotcs
-rw-r--r-- 1 grid oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1_eotcs_lock
-rw-r--r-- 1 grid oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1__lock
srwxrwxrwx 1 root root 0 Feb 2 18:00 sOHASD_IPC_SOCKET_11
srwxrwxrwx 1 root root 0 Feb 2 18:00 sOHASD_UI_SOCKET
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 sOracle_CSS_LclLstnr_eotcs_1
-rw-r--r-- 1 grid oinstall 0 Feb 2 18:00 sOracle_CSS_LclLstnr_eotcs_1_lock
srwxrwxrwx 1 root root 0 Feb 2 18:01 sora_crsqs
srwxrwxrwx 1 root root 0 Feb 2 18:00 sprocr_local_conn_0_PROC
srwxrwxrwx 1 root root 0 Feb 2 18:00 sprocr_local_conn_0_PROL
srwxrwxrwx 1 grid oinstall 0 Feb 2 18:00 sSYSTEM.evm.acceptor.auth

In Oracle Restart environment:

And below is an example output from Oracle Restart environment:

drwxrwxrwt 2 root oinstall 4096 Feb 2 21:25 .oracle

./.oracle:
srwxrwx--- 1 grid oinstall 0 Aug 1 17:23 master_diskmon
prw-r--r-- 1 grid oinstall 0 Oct 31 2009 npohasd
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 s#14478.1
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 s#14478.2
srwxrwxrwx 1 grid oinstall 0 Jul 14 08:02 s#2266.1
srwxrwxrwx 1 grid oinstall 0 Jul 14 08:02 s#2266.2
srwxrwxrwx 1 grid oinstall 0 Jul 7 10:59 s#2269.1
srwxrwxrwx 1 grid oinstall 0 Jul 7 10:59 s#2269.2
srwxrwxrwx 1 grid oinstall 0 Jul 31 22:10 s#2313.1
srwxrwxrwx 1 grid oinstall 0 Jul 31 22:10 s#2313.2
srwxrwxrwx 1 grid oinstall 0 Jun 29 21:58 s#2851.1
srwxrwxrwx 1 grid oinstall 0 Jun 29 21:58 s#2851.2
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sCRSD_UI_SOCKET
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 srac1DBG_CSSD
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 srac1DBG_OHASD
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sEXTPROC1521
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1_
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1_localhost
-rw-r--r-- 1 grid oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1_localhost_lock
-rw-r--r-- 1 grid oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1__lock
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sOHASD_IPC_SOCKET_11
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sOHASD_UI_SOCKET
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sgrid_CSS_LclLstnr_localhost_1
-rw-r--r-- 1 grid oinstall 0 Aug 1 17:23 sgrid_CSS_LclLstnr_localhost_1_lock
srwxrwxrwx 1 grid oinstall 0 Aug 1 17:23 sprocr_local_conn_0_PROL



Diagnostic file collection


If the issue can't be identified with this note, run $GRID_HOME/bin/diagcollection.sh as root on all nodes and upload all the .gz files it generates in the current directory.