Clusterware Startup Issue
Clusterware startup issue = node startup issue
1) Verify clusterware status
[root@node1 ~]# /u02/oracle/12.1.0.2/grid/bin/crsctl stat res -t
CRS-4535: Cannot communicate with Cluster Ready Services
CRS-4000: Command Status failed, or completed with errors.
[root@node1 ~]#
[root@node1 ~]# /u02/oracle/12.1.0.2/grid/bin/crsctl stat res -t -init
[root@node1 ~]# ps -ef | grep d.bin
root      5484      1 29 05:24 ?        00:55:50 /u02/oracle/12.1.0.2/grid/bin/ohasd.bin reboot
root      6020      1  0 05:24 ?        00:00:34 /u02/oracle/12.1.0.2/grid/bin/orarootagent.bin
oracrs    6116      1  0 05:24 ?        00:00:33 /u02/oracle/12.1.0.2/grid/bin/oraagent.bin
oracrs    6128      1  0 05:24 ?        00:00:29 /u02/oracle/12.1.0.2/grid/bin/mdnsd.bin
oracrs    6132      1  2 05:24 ?        00:04:58 /u02/oracle/12.1.0.2/grid/bin/evmd.bin
oracrs    6163      1  0 05:24 ?        00:00:31 /u02/oracle/12.1.0.2/grid/bin/gpnpd.bin
oracrs    6180   6132  1 05:24 ?        00:03:33 /u02/oracle/12.1.0.2/grid/bin/evmlogger.bin -o /u02/oracle/12.1.0.2/grid/log/[HOSTNAME]/evmd/evmlogger.info -l /u02/oracle/12.1.0.2/grid/log/[HOSTNAME]/evmd/evmlogger.log
oracrs    6255      1 47 05:24 ?        01:27:47 /u02/oracle/12.1.0.2/grid/bin/gipcd.bin
root     20116      1  0 05:27 ?        00:00:39 /u02/oracle/12.1.0.2/grid/bin/cssdmonitor
root    116579 105692  0 08:31 pts/1    00:00:00 grep d.bin
1. Check the complete ASM alert log (do not cut it).
2. Verify the generated HTML report by executing script 1 from MOS Note Doc ID 470211.1 (both scripts are listed below under "Detailed commands").
3. ls -lL /dev/oracleasm/disks/*
4. $ kfod disks=all          >>>>>>>>>>>>>> Execute this as the Grid owner.
5. # ls -l <GridHome>/bin/oracle
6. # ls -l <RDBMSHome>/bin/oracle
7. # crsctl stat res -t -init
8. # ps -ef | grep d.bin
[root@node1 tmp]# ls -lL /dev/oracleasm/disks/*
brw-rw---- 1 oracrs asmadmin 202,  65 Oct  1 05:24 /dev/oracleasm/disks/ACFS_DATA1
brw-rw---- 1 oracrs asmadmin 202,  81 Oct  1 05:24 /dev/oracleasm/disks/ACFS_DATA2
brw-rw---- 1 oracrs asmadmin 202,  97 Oct  1 05:24 /dev/oracleasm/disks/ACFS_DATA3
brw-rw---- 1 oracrs asmadmin 202, 113 Oct  1 05:24 /dev/oracleasm/disks/OCR_VOTE01
Detailed commands (script 1):
SPOOL ASM<#>_GENERIC_ASM_METADATA.html
-- ASM VERSIONS 10.1, 10.2, 11.1, 11.2, 12.1 & 12.2
SET MARKUP HTML ON
SET ECHO ON
SET PAGESIZE 200
ALTER SESSION SET NLS_DATE_FORMAT='DD-MON-YYYY HH24:MI:SS';
SELECT 'THIS ASM REPORT WAS GENERATED AT: ==)> ' , SYSDATE " " FROM DUAL;
SELECT 'INSTANCE NAME: ==)> ' , INSTANCE_NAME " " FROM V$INSTANCE;
SELECT 'HOSTNAME ASSOCIATED WITH THIS ASM INSTANCE: ==)> ' , MACHINE " " FROM V$SESSION WHERE PROGRAM LIKE '%SMON%';
SELECT * FROM V$INSTANCE;
SELECT * FROM GV$INSTANCE;
SELECT * FROM V$ASM_DISKGROUP;
SELECT GROUP_NUMBER, DISK_NUMBER, MOUNT_STATUS, HEADER_STATUS, MODE_STATUS, STATE, OS_MB, TOTAL_MB, FREE_MB, NAME, FAILGROUP, PATH
FROM V$ASM_DISK
ORDER BY GROUP_NUMBER, FAILGROUP, DISK_NUMBER;
SELECT * FROM V$ASM_DISK ORDER BY GROUP_NUMBER, DISK_NUMBER;
SELECT SUBSTR(D.NAME,1,16) AS ASMDISK, D.MOUNT_STATUS, D.STATE, DG.NAME AS DISKGROUP
FROM V$ASM_DISKGROUP DG, V$ASM_DISK D
WHERE DG.GROUP_NUMBER = D.GROUP_NUMBER;
SELECT * FROM V$ASM_CLIENT;
SELECT DG.NAME AS DISKGROUP, SUBSTR(C.INSTANCE_NAME,1,12) AS INSTANCE,
SUBSTR(C.DB_NAME,1,12) AS DBNAME, SUBSTR(C.SOFTWARE_VERSION,1,12) AS SOFTWARE,
SUBSTR(C.COMPATIBLE_VERSION,1,12) AS COMPATIBLE
FROM V$ASM_DISKGROUP DG, V$ASM_CLIENT C
WHERE DG.GROUP_NUMBER = C.GROUP_NUMBER;
SELECT * FROM V$ASM_ATTRIBUTE;
SELECT * FROM V$ASM_OPERATION;
SELECT * FROM GV$ASM_OPERATION;
SELECT * FROM V$VERSION;
SELECT * FROM V$ASM_ACFSSNAPSHOTS;
SELECT * FROM V$ASM_ACFSVOLUMES;
SELECT * FROM V$ASM_FILESYSTEM;
SELECT * FROM V$ASM_VOLUME;
SELECT * FROM V$ASM_VOLUME_STAT;
SELECT * FROM V$ASM_USER;
SELECT * FROM V$ASM_USERGROUP;
SELECT * FROM V$ASM_USERGROUP_MEMBER;
SELECT * FROM V$ASM_DISK_IOSTAT;
SELECT * FROM V$ASM_DISK_STAT;
SELECT * FROM V$ASM_DISKGROUP_STAT;
SELECT * FROM V$ASM_TEMPLATE;
SHOW PARAMETER
SHOW SGA
!echo "SELECT '" > /tmp/GPNPTOOL.SQL 2> /dev/null
! $ORACLE_HOME/bin/gpnptool get >> /tmp/GPNPTOOL.SQL 2>> /dev/null
!echo "' FROM DUAL;" >> /tmp/GPNPTOOL.SQL 2>> /dev/null
! cat /tmp/GPNPTOOL.SQL
SET ECHO OFF
-- DISPLAYS INFORMATION ABOUT THE CONTENTS OF THE SPFILE.
SELECT * FROM V$SPPARAMETER ORDER BY 2;
SELECT * FROM GV$SPPARAMETER ORDER BY 3;
-- DISPLAYS INFORMATION ABOUT THE INITIALIZATION PARAMETERS THAT ARE CURRENTLY IN EFFECT IN THE INSTANCE.
SELECT * FROM V$SYSTEM_PARAMETER ORDER BY 2;
SELECT * FROM GV$SYSTEM_PARAMETER ORDER BY 3;
-- 12C ACFS VIEWS
SELECT * FROM V$ASM_ACFS_ENCRYPTION_INFO;
SELECT * FROM V$ASM_ACFSREPL;
SELECT * FROM V$ASM_ACFSREPLTAG;
SELECT * FROM V$ASM_ACFS_SEC_ADMIN;
SELECT * FROM V$ASM_ACFS_SEC_CMDRULE;
SELECT * FROM V$ASM_ACFS_SEC_REALM;
SELECT * FROM V$ASM_ACFS_SEC_REALM_FILTER;
SELECT * FROM V$ASM_ACFS_SEC_REALM_GROUP;
SELECT * FROM V$ASM_ACFS_SEC_REALM_USER;
SELECT * FROM V$ASM_ACFS_SEC_RULE;
SELECT * FROM V$ASM_ACFS_SEC_RULESET;
SELECT * FROM V$ASM_ACFS_SEC_RULESET_RULE;
SELECT * FROM V$ASM_ACFS_SECURITY_INFO;
SELECT * FROM V$ASM_ACFSTAG;
-- 12C ASM AUDIT VIEWS
SELECT * FROM V$ASM_AUDIT_CLEAN_EVENTS;
SELECT * FROM V$ASM_AUDIT_CLEANUP_JOBS;
SELECT * FROM V$ASM_AUDIT_CONFIG_PARAMS;
SELECT * FROM V$ASM_AUDIT_LAST_ARCH_TS;
-- 12C ASM ESTIMATE VIEW
SELECT * FROM V$ASM_ESTIMATE;
SELECT * FROM GV$ASM_ESTIMATE;
-- SPARSE DISKGROUPS VIEW
SELECT * FROM V$ASM_DISK_SPARSE;
SELECT * FROM V$ASM_DISKGROUP_SPARSE;
SPOOL OFF
EXIT
Script 2:
SPOOL asm<#>_alias+files.html
-- ASM VERSIONS 10.1, 10.2, 11.1, 11.2, 12.1 & 12.2
SET MARKUP HTML ON
set echo on
set pagesize 200
COLUMN BYTES FORMAT 9999999999999999
alter session set nls_date_format='DD-MON-YYYY HH24:MI:SS';
select 'THIS ASM REPORT WAS GENERATED AT: ==)> ' , sysdate " " from dual;
select 'HOSTNAME ASSOCIATED WITH THIS ASM INSTANCE: ==)> ' , MACHINE " " from v$session where program like '%SMON%';
select * from v$asm_alias;
select * from v$asm_file;
show parameter asm
show parameter cluster
show parameter instance_type
show parameter instance_name
show parameter spfile
show sga
spool off
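One way to generate these HTML reports is to save each script to a file and run it in SQL*Plus against the ASM instance as the Grid owner. A minimal sketch (the ORACLE_SID value +ASM1 and the file name asm_metadata.sql are assumptions for this environment; the SPOOL inside the script writes the HTML report to the current directory):
$ export ORACLE_HOME=/u02/oracle/12.1.0.2/grid     # Grid home used on node1 in this case
$ export ORACLE_SID=+ASM1                          # ASM instance name is an assumption
$ $ORACLE_HOME/bin/sqlplus "/ as sysasm" @asm_metadata.sql   # asm_metadata.sql is a hypothetical file holding script 1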
One possible workaround is to delete the socket files.
Please execute the following action plan:
1. Stop CRS on the issue node:
crsctl stop crs
If it does not stop:
crsctl stop crs -f
Make sure the entire GI stack is down: 'ps -ef | grep d.bin' (kill any remaining processes with the OS kill command).
$GRID_ORACLE_HOME/bin/crsctl status resource -t
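If any d.bin processes remain after the stop, they can be killed before cleaning the socket files. A possible one-liner as root (a sketch; double-check the process list before killing anything):
# for p in $(ps -ef | grep d.bin | grep -v grep | awk '{print $2}'); do kill -9 $p; done   # $2 is the PID column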
2. Remove the socket files on the issue node.
Please do not delete the .oracle folder itself; delete only the socket files inside the folder.
[root@node1 bin]# ls -ltra /var/tmp/.oracle
rm -rf /var/tmp/.oracle/*
rm -rf /tmp/.oracle/*
[root@node1 bin]# ls -ltra /var/tmp/.oracle
96 -rw-r--r-- 1 fusadm dba 17 Sep 20 09:40 test_email
[root@node1 tmp]# ls
-la
total 2344
drwxrwxrwt. 5 root
root 4096 Oct 2 14:00 .
drwxr-xr-x. 22 root
root 4096 Feb 27 2018 ..
-rw-r--r-- 1
fusadm dba 1976 Oct 1 03:45 apache_alert
-rw-r--r-- 1
fusadm dba 77 Oct 2 14:00 apache_alert.sh
-rw-r--r-- 1
fusadm dba 1976 Oct 1 03:45 apache_master
-rw-r--r-- 1
fusadm dba 2095832 Oct 1 00:39 cloud.txt
-rw-r--r-- 1
fusadm dba 3015 Sep 17 07:36 fil
-rw-r--r-- 1
fusadm dba 153 Sep 17 07:36 fil1
-rw-r--r-- 1
fusadm dba 153 Sep 17 07:36 fil1.srt
-rw------- 1 root
root 1853 Jun 19 15:07 host_0
drwx------. 2 root
root 16384 Oct 10 2017 lost+found
-rw-r--r-- 1 root
root 157119 Aug 4 03:48 lpar2rrd-agent-10.160.36.212-root.err
-rw-r--r-- 1 root
root 31647 Oct 2 14:00
lpar2rrd-agent-10.160.36.212-root-ps_job.txt
-rw-r--r-- 1 root
root 0 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.stamp
-rw-r--r-- 1 root
root 10 Oct 2 13:49
lpar2rrd-agent-10.160.36.212-root.stamp-send
-rw-r--r-- 1 root
root 0 Oct 2 07:57
lpar2rrd-agent-10.160.36.212-root.stamp-trimlogs
-rw-r--r-- 1 root
root 4544 Oct 2 14:00 lpar2rrd-agent-10.160.36.212-root.txt
-rw-r--r-- 1 root
root 4130 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.txtorig
-rw-r--r-- 1 root
root 0 Oct 2 13:49 lpar2rrd-agent-10.160.36.212-root.txt-tmp
-rw-r--r-- 1 root
root 1761 Oct 2 14:00 lpar2rrd-agent.out
-rw-r--r-- 1
fusadm dba 2 Sep 13 11:39 mark
drwxrwxrwt 2 root
dba 12288 Oct 2 01:43 oldoracle
drwxrwxrwt 2 root
dba 4096 Oct 2 12:47 .oracle
-rw-r--r-- 1
fusadm dba 17 Sep 20 09:40 test_email
[root@node1 tmp]# cd .oracle
[root@node1 .oracle]# ls
npohasd                   ora_gipc_node1_DBG_OHASD   ora_gipc_node1_DBG_OHASD_lock
sOHASD_IPC_SOCKET_11      sOHASD_IPC_SOCKET_11_lock  sOHASD_UI_SOCKET
sOHASD_UI_SOCKET_lock     sprocr_local_conn_0_PROL   sprocr_local_conn_0_PROL_lock
[root@node1 .oracle]# ls -ltr
total 0
prw-r--r-- 1 root root 0 Oct  2 12:47 npohasd
-rw-r--r-- 1 root root 0 Oct  2 12:47 sprocr_local_conn_0_PROL_lock
srwxrwxrwx 1 root root 0 Oct  2 12:47 sprocr_local_conn_0_PROL
-rw-r--r-- 1 root root 0 Oct  2 12:47 ora_gipc_node1_DBG_OHASD_lock
srwxrwxrwx 1 root root 0 Oct  2 12:47 ora_gipc_node1_DBG_OHASD
-rw-r--r-- 1 root root 0 Oct  2 12:47 sOHASD_IPC_SOCKET_11_lock
srwxrwxrwx 1 root root 0 Oct  2 12:47 sOHASD_IPC_SOCKET_11
-rw-r--r-- 1 root root 0 Oct  2 12:47 sOHASD_UI_SOCKET_lock
srwxrwxrwx 1 root root 0 Oct  2 12:47 sOHASD_UI_SOCKET
[root@node1 .oracle]# rm *
rm: remove fifo `npohasd'? yes
rm: remove socket `ora_gipc_node1_DBG_OHASD'? yes
rm: remove regular empty file `ora_gipc_node1_DBG_OHASD_lock'? yes
rm: remove socket `sOHASD_IPC_SOCKET_11'? yes
rm: remove regular empty file `sOHASD_IPC_SOCKET_11_lock'? yes
rm: remove socket `sOHASD_UI_SOCKET'? yes
rm: remove regular empty file `sOHASD_UI_SOCKET_lock'? yes
rm: remove socket `sprocr_local_conn_0_PROL'? yes
rm: remove regular empty file `sprocr_local_conn_0_PROL_lock'? yes
[root@node1 .oracle]# ls
[root@node1 .oracle]# cat /oe
[root@node1 .oracle]# cat /etc/oratab
#Backup file is /u02/oracle/12.1.0.2/grid/srvm/admin/oratab.bak.node1 line added by Agent
#
3. Reboot the node.
4. crsctl stat res -t -init
Or:
If the permissions on the .oracle folder are not correct on the faulty node, check the permissions on a surviving node and set the same permissions, as below:
chmod 1777 /var/tmp/.oracle
chown root:dba /var/tmp/.oracle
This brings that directory on node1 into line with node2 & node3.
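A quick way to compare is to list the directory on a healthy node and on the faulty node side by side; the output below is only an illustration of what a healthy node typically shows (node2 stands for any surviving node):
# ssh node2 ls -ld /var/tmp/.oracle        # healthy node
drwxrwxrwt 2 root dba 4096 Oct  2 01:43 /var/tmp/.oracle
# ls -ld /var/tmp/.oracle                  # faulty node
Then apply the chmod/chown commands above on the faulty node so that both match.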
[root@node1 tmp]# ls
-la
total 2352
drwxrwxrwt. 5 root
root 4096 Oct 2 12:46 .
drwxr-xr-x. 22 root
root 4096 Feb 27 2018 ..
-rw-r--r-- 1
fusadm dba 1976 Oct 1 03:45 apache_alert
-rw-r--r-- 1
fusadm dba 77 Oct 2 12:45 apache_alert.sh
-rw-r--r-- 1
fusadm dba 1976 Oct 1 03:45 apache_master
-rw-r--r-- 1
fusadm dba 2095832 Oct 1 00:39 cloud.txt
-rw-r--r-- 1
fusadm dba 3015 Sep 17 07:36 fil
-rw-r--r-- 1
fusadm dba 153 Sep 17 07:36 fil1
-rw-r--r-- 1
fusadm dba 153 Sep 17 07:36 fil1.srt
-rw------- 1 root
root 1853 Jun 19 15:07 host_0
drwx------. 2 root
root 16384 Oct 10 2017 lost+found
-rw-r--r-- 1 root
root 157119 Aug 4 03:48 lpar2rrd-agent-10.160.36.212-root.err
-rw-r--r-- 1 root
root 32661 Oct 2 12:30
lpar2rrd-agent-10.160.36.212-root-ps_job.txt
-rw-r--r-- 1 root
root 0 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.stamp
-rw-r--r-- 1 root
root 10 Oct 2 12:30
lpar2rrd-agent-10.160.36.212-root.stamp-send
-rw-r--r-- 1 root
root 0 Oct 2 07:57
lpar2rrd-agent-10.160.36.212-root.stamp-trimlogs
-rw-r--r-- 1 root
root 11383 Oct 2 12:46 lpar2rrd-agent-10.160.36.212-root.txt
-rw-r--r-- 1 root
root 8691 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.txtorig
-rw-r--r-- 1 root
root 0 Oct 2 12:30 lpar2rrd-agent-10.160.36.212-root.txt-tmp
-rw-r--r-- 1 root
root 1761 Oct 2 12:46 lpar2rrd-agent.out
-rw-r--r-- 1
fusadm dba 2 Sep 13 11:39 mark
drwxrwxrwt 2 root
dba 12288 Oct 2 01:43 oldoracle
drwxr-xr-x 2 root
root 4096 Oct 2 12:46 .oracle
-rw-r--r-- 1
fusadm dba 17 Sep 20 09:40 test_email
[root@node1 tmp]# cd .oracle
[root@node1 .oracle]# ls -ltr
[root@node1 bin]# pwd
/u02/oracle/12.1.0.2/grid/bin
[root@node1 bin]# ls -ld /u02/oracle/12.1.0.2/grid/mdns/init
drwxr-x--- 2 oracrs dba 4096 Oct 11 2017 /u02/oracle/12.1.0.2/grid/mdns/init
[root@node1 bin]# cd /u02/oracle/12.1.0.2/grid/mdns/init
[root@node1 bin]# ls -ld /u02/oracle/12.1.0.2/grid/mdns/init/node1
-rw-r--r-- 1 oracrs dba 0 Oct 11 2017 /u02/oracle/12.1.0.2/grid/mdns/init/node1
[root@node1 bin]#
Or:
Be aware of the network socket files before you clean up /var/tmp, /tmp or /usr/tmp; they are critical for Oracle Clusterware to run.
ORA-29701: unable to connect to Cluster Synchronization Service
Unable To Connect To Cluster Manager as Network Socket Files are Removed
Purpose
This note explains the relevant issues if Oracle Clusterware's network socket files are deleted or wrongly owned.
Details
Oracle Clusterware (CRS or Grid Infrastructure) network socket files are located in /tmp/.oracle, /usr/tmp/.oracle or /var/tmp/.oracle; it is important not to touch them manually unless instructed by Oracle Support, to keep the clusterware healthy.
Cause
The hidden directory '/var/tmp/.oracle' (or /tmp/.oracle on some platforms) or its content was removed while instances & the CRS stack were up and running. Typically this directory contains a number of "special" socket files that are used by local clients to connect via the IPC protocol (sqlnet) to various Oracle processes including the TNS listener, the CSS, CRS & EVM daemons or even database or ASM instances. These files are created when the "listening" process starts.
A typical listing of the '/var/tmp/.oracle' directory shows a number of such files:
# cd /var/tmp/.oracle
# ls -l
srwxrwxrwx 1 oracle dba  0 Sep  6 10:50 s#9862.2
srwxrwxrwx 1 oracle dba  0 Sep 15 11:35 sAracnode1_crs_evm
srwxrwxrwx 1 root   root 0 Sep 15 11:35 sracnode1DBG_CRSD
srwxrwxrwx 1 oracle dba  0 Sep 15 11:34 sracnode1DBG_CSSD
srwxrwxrwx 1 oracle dba  0 Sep 15 11:35 sracnode1DBG_EVMD
srwxrwxrwx 1 oracle dba  0 Sep 15 11:35 sCracnode1_crs_evm
srwxrwxrwx 1 root   root 0 Sep 15 11:35 sCRSD_UI_SOCKET
srwxrwxrwx 1 oracle dba  0 Sep 15 11:35 sEXTPROC
srwxrwxrwx 1 oracle dba  0 Sep 15 11:34 sOCSSD_LL_racnode1_crs
srwxrwxrwx 1 oracle dba  0 Sep 15 11:34 sOracle_CSS_LclLstnr_crs_1
srwxrwxrwx 1 root   root 0 Sep 15 11:35 sora_crsqs
srwxrwxrwx 1 root   root 0 Sep 15 11:35 sprocr_local_conn_0_PROC
srwxrwxrwx 1 oracle dba  0 Sep 15 11:35 sSYSTEM.evm.acceptor.auth
When a file is deleted on Unix, it becomes "invisible" at the filesystem level; however, any process which had the file open when it was deleted will still be able to use it.
Attempts to open a "deleted" file for reading will fail (ENOENT 2 /* No such file or directory */); opening a file with the same name for writing will create a new (different) file.
Therefore only processes that attempted to open the socket file during the initial handshake were failing with ORA-29701, while existing processes were unaffected.
A very common cause for this issue is system administration activity that involves freeing up space in /tmp, /var/tmp etc - either run occasionally or regularly via cron jobs. As a rule of thumb, the directory .oracle in /var/tmp or /tmp should always be excluded from such activities. The best time to completely clean out these directories is during system boot - before the clusterware is started.
Solution
The only way to re-create these special files is to restart the affected components (instance, listener, CRS). In a RAC environment this requires a shutdown & restart of the entire CRS stack.
As these special files are required to communicate with the various CRS daemons, it most likely will not be possible to stop (and restart) the CRS stack using the following commands as the root user - but it won't hurt to try it anyway:
11g:
# $ORA_CRS_HOME/bin/crsctl stop crs
# $ORA_CRS_HOME/bin/crsctl start crs
If the above fails to successfully stop the CRS stack, a system reboot will be inevitable.
As for deleting files from a temporary directory via a cron job (or otherwise): the directory '/var/tmp/.oracle' (on some platforms /tmp/.oracle) should be excluded from such jobs/tasks. The files in this directory occupy only a few bytes and generally do not need to be cleaned up.
Please note that the location of the .oracle directory is not configurable, so the only way to avoid such issues is to make sure it is not deleted while the clusterware is up & running.
If the specified temp location must be cleaned to release space, consider deleting only files which meet both criteria:
Or:
12.1.0.2 Grid Infrastructure Stack does not Start due to .pid file issues (Doc ID 2028511.1)
Applies to:
Oracle Database - Enterprise Edition - Version 12.1.0.2 and later
Information in this document applies to any platform.
Symptoms
On a 12.1.0.2 multi-node cluster, when starting the clusterware, the GI stack fails to start. This can happen to any of the resources managed by OHASD, eg: ora.mdnsd (mdnsd.bin), ora.gipcd (gipcd.bin), ora.gpnpd (gpnpd.bin), ora.evmd (evmd.bin) etc.
Case I. No messages are written to the corresponding <process>.trc file. The node alert log (<GI home>/log/<node>/alert<node>.log) shows the following messages:
2015-07-07
14:43:22.594 [ORAAGENT(4642)]CRS-5818: Aborted command 'start' for
resource 'ora.gipcd'. Details at (:CRSAGF00113:) {0:0:2} in
/u01/app/grid/diag/crs/racnode1/crs/trace/ohasd_oraagent_grid.trc.
2015-07-07
14:43:23.496 [ORAAGENT(4642)]CRS-5017: The resource action "ora.gipcd
start" encountered the following error:
2015-07-07
14:43:23.496+Start action for daemon aborted. For details refer to
"(:CLSN00107:)" in
"/u01/app/grid/diag/crs/racnode1/crs/trace/ohasd_oraagent_grid.trc".
2015-07-07
14:43:26.803 [OHASD(4253)]CRS-2757: Command 'Start' timed out waiting
for response from the resource 'ora.gipcd'. Details at (:CRSPE00163:)
{0:0:2} in /u01/app/grid/diag/crs/racnode1/crs/trace/ohasd.trc.
Case II.
<ORACLE_BASE>/diag/crs/<host>/crs/trace/gipcd.trc shows:
2015-10-26
11:58:05.901092 : CLSDMT:180586240: PID for the Process [5395],
connkey 13
2015-10-26
11:58:05.901470 : CLSDMT:180586240: Failed to record pid for GIPCD
2015-10-26
11:58:05.901474 : CLSDMT:180586240: Terminating process
...
trace file
/home/grid/oraclebase/diag/crs/racnode1/crs/trace/gipcd.trc
Oracle Database 12c
Clusterware Release 12.1.0.2.0 - Production Copyright 1996, 2014
Oracle. All rights reserved.
DDE: Flood control
is not active
2015-10-26
11:59:05.909217 : GIPCD:180586240: gipcd_ExitCB: one or more of
gipcdThreads failed to come into offline in 60 seconds of time,
aborting the gipcd process
CLSB:180586240:
Oracle Clusterware infrastructure error in GIPCD (OS PID 5395): Fatal
signal 6 has occurred in program gipcd thread 180586240; nested
signal count is 1
Incident 33 created,
dump file:
/home/grid/oraclebase/diag/crs/racnode1/crs/incident/incdir_33/gipcd_i33.trc
CRS-8503 [] [] [] []
[] [] [] [] [] [] [] []
Case III. crsctl
start crs shows:
Start action for
daemon aborted. For details refer to "(:CLSN00107:)" in
"D:\app\grid\racdbauser\diag\crs\racnode1\crs\trace\ohasd_oraagent_system.trc".
CRS-2674: Start of
'ora.mdnsd' on 'racnode1' failed
CRS-2679: Attempting
to clean 'ora.mdnsd' on 'racnode1'
CRS-2681: Clean of
'ora.mdnsd' on 'racnode1' succeeded
CRS-2672: Attempting
to start 'ora.gpnpd' on 'racnode1'
CRS-5017: The
resource action "ora.gpnpd start" encountered the following
error:
Start action for
daemon aborted. For details refer to "(:CLSN00107:)" in
"D:\app\grid\racdbauser\diag\crs\racnode1\crs\trace\ohasd_oraagent_system.trc".
CRS-2883: Resource
'ora.gpnpd' failed during Clusterware stack start.
CRS-4406: Oracle
High Availability Services synchronous start failed.
CRS-4000: Command
Start failed, or completed with errors.
2015/11/30 18:32:21
CLSRSC-117: Failed to start Oracle Clusterware stack
Cause
The issues are caused either by wrong ownership of the *.pid files or *OUT.trc files corresponding to the resources, or by the corresponding *.pid file missing.
Starting with Grid Infrastructure release 12.1.0.2, the pid file for each daemon process not only exists under <GRID_HOME>/<resource>/<host>.pid, but also exists under <ORACLE_BASE>/crsdata/<hostname>/output/<resource>.pid.
Case I. It is caused by wrong ownership of the *.pid file under <ORACLE_BASE>/crsdata/<hostname>/output/<resource>.pid.
-rw-r--r-- 1 root root 4943 Jun 18 07:20 /u01/app/grid/crsdata/racnode1/output/gipcdOUT.trc
-rw-r--r-- 1 root root 5 Jun 18 07:20 /u01/app/grid/crsdata/racnode1/output/gipcd.pid
The files are owned by root, and the <grid> user has no write privilege on them.
If the <resource>.pid file is not writable by the <grid> user, then the process will fail to start. This includes gipcd.bin, gpnpd.bin, mdnsd.bin, evmd.bin etc.
When this happens, a file named /tmp/<resource>_<pid>.out is generated, for example /tmp/gipcd_32055.out, with content similar to:
Oracle Clusterware infrastructure error in GIPCD (OS PID 32055): Error in an OS-dependent function or service
Error category: -2, operation: open, location: SCLSB00009, OS error: 13
OS error message: Permission denied
Additional information: Call to open daemon stdout/stderr file failed
Oracle Clusterware infrastructure fatal error in GIPCD (OS PID 32055): Internal error (ID (:CLSB00126:)) - Failed to redirect daemon standard outputs using location /u01/app/grid/crsdata/racnode1/output and root name gipcd
This file shows the exact path / location of the files that need to be checked.
Case II. It is caused by the .pid file under GRID_HOME/gipc/init/<host> or GRID_HOME/gipc/init/<host>.pid not being present or not writable.
Case III. It is caused by the *.pid file missing from both locations due to accidental manual removal of these files.
Solution
Case I. Change the ownership of the files to be owned by the <grid> user, eg: as the root user:
# cd /u01/app/grid/crsdata/racnode1/output
# chown grid:oinstall gipcd*
Make similar changes to other PID files if needed.
The clusterware should start up automatically after the change.
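For example, if several files under the output directory are owned by root, they can be fixed in one pass as root (a sketch; grid:oinstall is an assumption, use the actual Grid owner and its install group):
# cd /u01/app/grid/crsdata/racnode1/output
# chown grid:oinstall *.pid *OUT.trc     # adjust owner/group to match a healthy node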
Case II. Touch two files and set the correct ownership and permissions, eg: as the grid user:
touch <GRID_HOME>/gipc/init/racnode1
touch <GRID_HOME>/gipc/init/racnode1.pid
chmod 644 <GRID_HOME>/gipc/init/*
Restart the CRS stack.
Case III. Recreate an empty *.pid file for each missing pid file, set with the correct ownership and permissions.
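For example, if gipcd.pid is the missing file, a sketch of recreating it (the path, owner and group follow the reference listing below and are assumptions for this environment):
# touch /u01/app/grid/crsdata/racnode1/output/gipcd.pid      # empty pid file
# chown grid:oinstall /u01/app/grid/crsdata/racnode1/output/gipcd.pid
# chmod 644 /u01/app/grid/crsdata/racnode1/output/gipcd.pid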
As a reference, here are the *.pid files that exist under <ORACLE_BASE>/crsdata/<hostname>/output/:
-rw-r--r--. 1 oracle
oinstall 5 Dec 1 08:36 crsd_oraagent_oracle.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:35 crsd_oraagent_grid.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:36 crsd_orarootagent_root.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:35 crsd.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:36 crsd_scriptagent_grid.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 evmd.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 evmlogger.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 gipcd.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 gpnpd.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 mdnsd.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ocssd.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:35 octssd.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:34 ohasd_cssdagent_root.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:34 ohasd_cssdmonitor_root.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ohasd_oraagent_grid.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:35 ohasd_orarootagent_root.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:34 ohasd.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:35 ologgerd.pid
-rw-r--r--. 1 root
root 5 Dec 1 08:35 osysmond.pid
Here are the pid
files exist under GRID_HOME:
-rw-r--r--. 1 root
root 0 Jul 29 14:52 ./crs/init/lccn0
-rw-r--r--. 1 root
root 5 Dec 1 08:35 ./crs/init/lccn0.pid
-rw-r--r--. 1 root
root 0 Jul 29 14:51 ./ctss/init/lccn0
-rw-r--r--. 1 root
root 5 Dec 1 08:35 ./ctss/init/lccn0.pid
-rw-r--r--. 1 grid
oinstall 0 Jul 29 14:50 ./evm/init/lccn0
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ./evm/init/lccn0.pid
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ./gipc/init/lccn0
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ./gipc/init/lccn0.pid
-rw-r--r--. 1 grid
oinstall 0 Jul 29 14:50 ./gpnp/init/lccn0
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ./gpnp/init/lccn0.pid
-rw-r--r--. 1 grid
oinstall 0 Jul 29 14:50 ./mdns/init/lccn0
-rw-r--r--. 1 grid
oinstall 5 Dec 1 08:34 ./mdns/init/lccn0.pid
-rw-r--r--. 1 root
root 0 Jul 29 14:50 ./ohasd/init/lccn0
-rw-r--r--. 1 root
root 5 Dec 1 08:34 ./ohasd/init/lccn0.pid
-rw-r--r--. 1 root
root 0 Jul 29 14:54 ./ologgerd/init/lccn0
-rw-r--r--. 1 root
root 5 Dec 1 08:35 ./ologgerd/init/lccn0.pid
-rw-r--r--. 1 root
root 0 Jul 29 14:52 ./osysmond/init/lccn0
-rw-r--r--. 1 root
root 5 Dec 1 08:35 ./osysmond/init/lccn0.pid
For the Windows platform, the corresponding pid files should exist under <ORACLE_BASE>\Administrator\crsdata\<host>\output\:
crsd.pid
crsd_oraagent_system.pid
crsd_scriptagent_system.pid
crsd_orarootagent_system.pid
evmlogger.pid
evmd.pid
gpnpd.pid
gipcd.pid
mdnsd.pid
ocssd.pid
octssd.pid
ohasd.pid
ohasd_cssdagent_system.pid
ohasd_cssdmonitor_system.pid
ohasd_orarootagent_system.pid
ohasd_oraagent_system.pid
ologgerd.pid
osysmond.pid
<host> and
<host>.pid files exist under <GRID_HOME>:
./crs/init/
./evm/init/
./gpnp/init/
./mdns/init/
./gipc/init/
./ohasd/init/
./ctss/init/
./osysmond/init/
./ologgerd/init/
Restart the CRS
stack after above.
==========================
Top 5 Grid Infrastructure Startup Issues (Doc ID 1368382.1)
Applies to:
Oracle Database -
Enterprise Edition - Version 11.2.0.1 to 11.2.0.4 [Release 11.2]
Oracle Database
Cloud Schema Service - Version N/A and later
Oracle Database
Exadata Express Cloud Service - Version N/A and later
Oracle Database
Exadata Cloud Machine - Version N/A and later
Oracle Cloud
Infrastructure - Database Service - Version N/A and later
Information in this
document applies to any platform.
Purpose
The purpose of this note is to provide a summary of the top 5 issues that may prevent the successful startup of the Grid Infrastructure (GI) stack.
Scope
This note applies to 11gR2 Grid Infrastructure only.
To determine the status of GI, please run the following commands:
1. $GRID_HOME/bin/crsctl check crs
2. $GRID_HOME/bin/crsctl stat res -t -init
3. $GRID_HOME/bin/crsctl stat res -t
4. ps -ef | egrep 'init|d.bin'
Details
Issue #1: CRS-4639: Could not contact Oracle High Availability Services, ohasd.bin not running or ohasd.bin is running but no init.ohasd or other processes
Symptoms:
1. Command
'$GRID_HOME/bin/crsctl check crs' returns error:
CRS-4639: Could
not contact Oracle High Availability Services
2. Command 'ps -ef |
grep init' does not show a line similar to:
root 4878 1 0
Sep12 ? 00:00:02 /bin/sh /etc/init.d/init.ohasd run
3. Command 'ps -ef |
grep d.bin' does not show a line similar to:
root 21350 1 6
22:24 ? 00:00:01 /u01/app/11.2.0/grid/bin/ohasd.bin reboot
Or it may only
show "ohasd.bin reboot" process without any other processes
4. ohasd.log report:
2013-11-04
09:09:15.541: [ default][2609911536] Created alert : (:OHAS00117:) :
TIMED OUT WAITING FOR OHASD MONITOR
5. ohasOUT.log
report:
2013-11-04
08:59:14
Changing
directory to /u01/app/11.2.0/grid/log/lc1n1/ohasd
OHASD
starting
Timed out
waiting for init.ohasd script to start; posting an alert
6. ohasd.bin keeps
restarting, ohasd.log report:
2014-08-31
15:00:25.132: [ CRSSEC][733177600]{0:0:2} Exception:
PrimaryGroupEntry constructor failed to validate group name with
error: 0 groupId: 0x7f8df8022450 acl_string: pgrp:spec:r-x
2014-08-31
15:00:25.132: [ CRSSEC][733177600]{0:0:2} Exception: ACL entry
creation failed for: pgrp:spec:r-x
2014-08-31
15:00:25.132: [ INIT][733177600]{0:0:2} Dump State Starting ...
7. Only ohasd.bin is running, but there is nothing written in ohasd.log. The OS /var/log/messages shows:
2015-07-12 racnode1 logger: autorun file for ohasd is missing
Possible Causes:
1. For OL5/RHEL5 and below, and other platforms, the file '/etc/inittab' does not contain a line similar to the following (platform dependent):
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
For OL6/RHEL6 and above, upstart is not configured properly.
2. runlevel 3 has
not been reached, some rc3 script is hanging
3. the init process
(pid 1) did not spawn the process defined in /etc/inittab (h1) or a
bad entry before init.ohasd like xx:wait:<process> blocked the
start of init.ohasd
4. CRS autostart is
disabled
5. The Oracle Local
Registry ($GRID_HOME/cdata/<node>.olr) is missing or corrupted
(check as root user via "ocrdump -local /tmp/olr.log", the
/tmp/olr.log should contain all GI daemon processes related
information, compare with a working cluster to verify)
6. root user was in
group "spec" before but now the group "spec" has
been removed, the old group for root user is still recorded in the
OLR, this can be verified in OLR dump
7. HOSTNAME was null
when init.ohasd started especially after a node reboot
Solutions:
1. For OL5/RHEL5 and
under, add the following line to /etc/inittab
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1
</dev/null
and then run
"init q" as the root user.
For Linux
OL6/RHEL6, please refer to Note 1607600.1
2. Run command 'ps
-ef | grep rc' and kill any remaining rc3 scripts that appear to be
stuck.
3. Remove the bad
entry before init.ohasd. Consult with OS vendor if "init q"
does not spawn "init.ohasd run" process. As a workaround,
start the
init.ohasd manually, eg: as root user, run "/etc/init.d/init.ohasd
run >/dev/null 2>&1 </dev/null &"
4. Enable CRS
autostart:
# crsctl enable
crs
# crsctl start
crs
5. Restore OLR from
backup, as root user: (refer to Note 1193643.1)
# crsctl stop crs
-f
# touch
<GRID_HOME>/cdata/<node>.olr
# chown
root:oinstall <GRID_HOME>/cdata/<node>.olr
# ocrconfig
-local -restore <GRID_HOME>/cdata/<node>/backup_<date>_<num>.olr
# crsctl start
crs
If the OLR backup does not exist for any reason, a deconfig and rerun of root.sh is required to recreate the OLR, as the root user:
#
<GRID_HOME>/crs/install/rootcrs.pl -deconfig -force
#
<GRID_HOME>/root.sh
6. Reinitializing/recreating the OLR is required, using the same commands as for recreating the OLR above.
7. Restart the
init.ohasd process or add "sleep 30" in init.ohasd to allow
hostname populated correctly before starting Clusterware, refer to
Note 1427234.1
8. If the above does not help, check the OS messages for the ohasd.bin logger message and manually execute the crswrapexece.pl command mentioned in the OS message with LD_LIBRARY_PATH set to <GRID_HOME>/lib to continue debugging.
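A sketch of such a manual run, modelled on the exec line logged in the OS messages file (an example of that line appears later in this document); the grid home /u01/app/11.2.0/grid and the node name in the env file are placeholders:
# export LD_LIBRARY_PATH=/u01/app/11.2.0/grid/lib
# /u01/app/11.2.0/grid/perl/bin/perl -I/u01/app/11.2.0/grid/perl/lib /u01/app/11.2.0/grid/bin/crswrapexece.pl /u01/app/11.2.0/grid/crs/install/s_crsconfig_<node>_env.txt /u01/app/11.2.0/grid/bin/ohasd.bin "reboot"
Any error printed by this command usually points at the missing library, file or permission that blocks ohasd.bin.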
Issue #2: CRS-4530:
Communications failure contacting Cluster Synchronization Services
daemon, ocssd.bin is not running
Symptoms:
1. Command
'$GRID_HOME/bin/crsctl check crs' returns errors:
CRS-4638: Oracle
High Availability Services is online
CRS-4535: Cannot
communicate with Cluster Ready Services
CRS-4530:
Communications failure contacting Cluster Synchronization Services
daemon
CRS-4534: Cannot
communicate with Event Manager
2. Command 'ps -ef |
grep d.bin' does not show a line similar to:
oragrid 21543 1
1 22:24 ? 00:00:01 /u01/app/11.2.0/grid/bin/ocssd.bin
3. ocssd.bin is running but aborts with the message "CLSGPNP_CALL_AGAIN" in ocssd.log
4. ocssd.log shows:
2012-01-27
13:42:58.796: [ CSSD][19]clssnmvDHBValidateNCopy: node 1, racnode1,
has a disk HB, but no network HB, DHB has rcfg 223132864, wrtcnt,
1112, LATS 783238209,
lastSeqNo 1111,
uniqueness 1327692232, timestamp 1327693378/787089065
5. for 3 or more
node cases, 2 nodes form cluster fine, the 3rd node joined then
failed, ocssd.log show:
2012-02-09
11:33:53.048: [ CSSD][1120926016](:CSSNM00008:)clssnmCheckDskInfo:
Aborting local node to avoid splitbrain. Cohort of 2 nodes with
leader 2, racnode2, is smaller than
cohort of 2 nodes
led by node 1, racnode1, based on map type 2
2012-02-09
11:33:53.048: [ CSSD][1120926016]###################################
2012-02-09
11:33:53.048: [ CSSD][1120926016]clssscExit: CSSD aborting from
thread clssnmRcfgMgrThread
6. ocssd.bin startup times out after 10 minutes
2012-04-08
12:04:33.153: [ CSSD][1]clssscmain: Starting CSS daemon, version
11.2.0.3.0, in (clustered) mode with uniqueness value 1333911873
......
2012-04-08
12:14:31.994: [ CSSD][5]clssgmShutDown: Received abortive shutdown
request from client.
2012-04-08
12:14:31.994: [ CSSD][5]###################################
2012-04-08
12:14:31.994: [ CSSD][5]clssscExit: CSSD aborting from thread
GMClientListener
2012-04-08
12:14:31.994: [ CSSD][5]###################################
2012-04-08
12:14:31.994: [ CSSD][5](:CSSSC00012:)clssscExit: A fatal error
occurred and the CSS daemon is terminating abnormally
7. alert<node>.log
shows:
2014-02-05
06:16:56.815
[cssd(3361)]CRS-1714:Unable
to discover any voting files, retrying discovery in 15 seconds;
Details at (:CSSNM00070:) in
/u01/app/11.2.0/grid/log/bdprod2/cssd/ocssd.log
...
2014-02-05
06:27:01.707
[ohasd(2252)]CRS-2765:Resource
'ora.cssdmonitor' has failed on server 'bdprod2'.
2014-02-05
06:27:02.075
[ohasd(2252)]CRS-2771:Maximum
restart attempts reached for resource 'ora.cssd'; will not restart.
Possible Causes:
1. Voting disk is
missing or inaccessible
2. Multicast is not
working for private network for 11.2.0.2.x (expected behavior) or
11.2.0.3 PSU5/PSU6/PSU7 or 12.1.0.1 (due to Bug 16547309)
3. The private network is not working; ping or traceroute <private host> shows destination unreachable. Or a firewall is enabled for the private network while ping/traceroute work fine.
4. gpnpd does not
come up, stuck in dispatch thread, Bug 10105195
5. too many disks
discovered via asm_diskstring or slow scan of disks due to Bug
13454354 on Solaris 11.2.0.3 only
6. In some cases, a known bug could prevent the 2nd node's ocssd.bin from joining the cluster after the private network issue is fixed, refer to Note 1479380.1
Solutions:
1. restore the
voting disk access by checking storage access, disk permissions etc.
If the disk is
not accessible at OS level, please engage system administrator to
restore the disk access.
If the voting
disk is missing from the OCR ASM diskgroup, start CRS in exclusive
mode and recreate the voting disk:
# crsctl start
crs -excl
# crsctl replace
votedisk <+OCRVOTE diskgroup>
2. Refer to Document
1212703.1 for multicast test and fix. For 11.2.0.3 PSU5/PSU6/PSU7 or
12.1.0.1, either enable multicast for private network or apply patch
16547309 or latest PSU.
3. Consult with the network administrator to restore private network access or disable the firewall for the private network (for Linux, check 'service iptables status' and 'service ip6tables status'; see the firewall check example after this solutions list)
4. Kill the
gpnpd.bin process on surviving node, refer Document 10105195.8
Once above issues
are resolved, restart Grid Infrastructure stack.
If ping/traceroute all work for the private network and there was a failed 11.2.0.1 to 11.2.0.2 upgrade, please check Bug 13416559 for a workaround.
5. Limit the number
of ASM disks scan by supplying a more specific asm_diskstring, refer
to bug 13583387
For Solaris
11.2.0.3 only, please apply patch 13250497, see Note 1451367.1.
6. Refer to the
solution and workaround in Note 1479380.1
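As referenced in solution 3 above, a quick firewall check on Linux might look like this (a sketch for RHEL/OL 5 and 6 style init; adjust for systemd-based platforms):
# service iptables status
# service ip6tables status
# service iptables stop            # temporary, only to test the interconnect; persist with: chkconfig iptables off
Confirm private-network connectivity again (ping/traceroute) after the firewall is stopped.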
Issue #3: CRS-4535:
Cannot communicate with Cluster Ready Services, crsd.bin is not
running
Symptoms:
1. Command
'$GRID_HOME/bin/crsctl check crs' returns errors:
CRS-4638: Oracle
High Availability Services is online
CRS-4535: Cannot
communicate with Cluster Ready Services
CRS-4529:
Cluster Synchronization Services is online
CRS-4534: Cannot
communicate with Event Manager
2. Command 'ps -ef |
grep d.bin' does not show a line similar to:
root 23017 1 1
22:34 ? 00:00:00 /u01/app/11.2.0/grid/bin/crsd.bin reboot
3. Even if the
crsd.bin process exists, command 'crsctl stat res -t -init' shows:
ora.crsd
1 ONLINE
INTERMEDIATE
Possible Causes:
1. ocssd.bin is not
running or resource ora.cssd is not ONLINE
2. The +ASM<n> instance cannot start up due to various reasons
3. OCR is
inaccessible
4. Network
configuration has been changed causing gpnp profile.xml mismatch
5.
$GRID_HOME/crs/init/<host>.pid file for crsd has been removed
or renamed manually, crsd.log shows: 'Error3 -2 writing PID to the
file'
6. ocr.loc content
mismatch with other cluster nodes. crsd.log shows: 'Shutdown
CacheLocal. my hash ids don't match'
7. private network
is pingable with normal ping command but not pingable with jumbo
frame size (eg: ping -s 8900 <private ip>) when jumbo frame is
enabled (MTU: 9000+). Or partial cluster nodes have jumbo frame set
(MTU: 9000) and the problem node does not have jumbo frame set
(MTU:1500)
8. On AIX 6.1 TL08
SP01 and AIX 7.1 TL02 SP01, due to truncation of multicast packets.
9. udp_sendspace is
set to default 9216 on AIX platform
Solutions:
1. Check the
solution for Issue 2, ensure ocssd.bin is running and ora.cssd is
ONLINE
2. For 11.2.0.2+,
ensure that the resource ora.cluster_interconnect.haip is ONLINE,
refer to Document 1383737.1 for ASM startup issues related to HAIP.
Check if
GRID_HOME/bin/oracle binary is linked with RAC option Document
284785.1
3. Ensure the OCR
disk is available and accessible. If the OCR is lost for any reason,
refer to Document 1062983.1 on how to restore the OCR.
4. Restore network
configuration to be the same as interface defined in
$GRID_HOME/gpnp/<node>/profiles/peer/profile.xml, refer to
Document 283684.1 for private network modification.
5. touch the file
with <host>.pid under $GRID_HOME/crs/init.
For 11.2.0.1, the
file is owned by <grid> user.
For 11.2.0.2, the
file is owned by root user.
6. Using ocrconfig
-repair command to fix the ocr.loc content:
for example, as
root user:
# ocrconfig -repair
-add +OCR2 (to add an entry)
# ocrconfig -repair
-delete +OCR2 (to remove an entry)
ohasd.bin needs to
be up and running in order for above command to run.
Once above issues
are resolved, either restart GI stack or start crsd.bin via:
# crsctl start
res ora.crsd -init
7. Engage the network admin to enable jumbo frames at the switch layer if they are enabled at the network interface. If jumbo frames are not required, change the MTU to 1500 for the private network on all nodes, then restart the GI stack on all nodes (see the MTU check example after this solutions list).
8. On AIX 6.1 TL08
SP01 and AIX 7.1 TL02 SP01, apply AIX patch per Document 1528452.1
AIX 6.1 TL8 or 7.1 TL2: 11gR2 GI Second Node Fails to Join the
Cluster as CRSD and EVMD are in INTERMEDIATE State
9. Increase
udp_sendspace to recommended value, refer to Document 1280234.1
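As referenced in solution 7 above, the interface MTU can be checked and, if needed, temporarily changed on Linux like this (a sketch; eth1 stands for the private interconnect interface and is an assumption):
# ip link show eth1 | grep mtu           # current MTU of the private interface
# ifconfig eth1 mtu 1500                 # temporary change; persist it in the interface configuration file
The MTU must match on all cluster nodes and on the switch ports used by the interconnect.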
Issue #4: Agent or
mdnsd.bin, gpnpd.bin, gipcd.bin not running
Symptoms:
1. orarootagent not
running. ohasd.log shows:
2012-12-21
02:14:05.071: [ AGFW][24] {0:0:2} Created alert : (:CRSAGF00123:)
: Failed to start the agent process:
/grid/11.2.0/grid_2/bin/orarootagent Category: -1 Operation: fail
Loc: canexec2 OS error: 0 Other : no exe permission, file
[/grid/11.2.0/grid_2/bin/orarootagent]
2. mdnsd.bin,
gpnpd.bin or gipcd.bin not running, here is a sample for mdnsd log
file:
2012-12-31
21:37:27.601: [ clsdmt][1088776512]Creating PID [4526] file for home
/u01/app/11.2.0/grid host lc1n1 bin mdns to
/u01/app/11.2.0/grid/mdns/init/
2012-12-31
21:37:27.602: [ clsdmt][1088776512]Error3 -2 writing PID [4526] to
the file []
2012-12-31
21:37:27.602: [ clsdmt][1088776512]Failed to record pid for MDNSD
or
2012-12-31
21:39:52.656: [ clsdmt][1099217216]Creating PID [4645] file for home
/u01/app/11.2.0/grid host lc1n1 bin mdns to
/u01/app/11.2.0/grid/mdns/init/
2012-12-31
21:39:52.656: [ clsdmt][1099217216]Writing PID [4645] to the file
[/u01/app/11.2.0/grid/mdns/init/lc1n1.pid]
2012-12-31
21:39:52.656: [ clsdmt][1099217216]Failed to record pid for MDNSD
3. oraagent or
appagent not running, crsd.log shows:
2012-12-01
00:06:24.462: [ AGFW][1164069184] {0:2:27} Created alert :
(:CRSAGF00130:) : Failed to start the agent
/u01/app/grid/11.2.0/bin/appagent_oracle
Possible Causes:
1. orarootagent
missing execute permission
2. missing process
associated <node>.pid file or the file has wrong ownership or
permission
3. wrong
permission/ownership within GRID_HOME
4. GRID_HOME disk
space 100% full
Solutions:
1. Either compare
the permission/ownership with a good node GRID_HOME and make
correction accordingly or as root user:
# cd
<GRID_HOME>/crs/install
# ./rootcrs.pl
-unlock
# ./rootcrs.pl
-patch
This will stop the clusterware stack, set permission/ownership to root for the required files, and restart the clusterware stack.
2. If the
corresponding <node>.pid does not exist, touch the file with
correct ownership and permission, otherwise correct the <node>.pid
ownership/permission as required, then restart the clusterware stack.
Here is the list of
<node>.pid file under <GRID_HOME>, owned by root:root,
permission 644:
./ologgerd/init/<node>.pid
./osysmond/init/<node>.pid
./ctss/init/<node>.pid
./ohasd/init/<node>.pid
./crs/init/<node>.pid
Owned by
<grid>:oinstall, permission 644:
./mdns/init/<node>.pid
./evm/init/<node>.pid
./gipc/init/<node>.pid
./gpnp/init/<node>.pid
3. For cause 3,
please refer to solution 1.
4. Please clean up
the disk space from GRID_HOME, particularly clean up old files under
<GRID_HOME>/log/<node>/client/, <diag
dest>/tnslsnr/<node>/<listener name>/alert/
Issue #5: ASM
instance does not start, ora.asm is OFFLINE
Symptoms:
1. Command 'ps -ef |
grep asm' shows no ASM processes
2. Command 'crsctl
stat res -t -init' shows:
ora.asm
1
ONLINE OFFLINE
Possible Causes:
1. ASM spfile is
corrupted
2. ASM discovery
string is incorrect and therefore voting disk/OCR cannot be
discovered
3. ASMlib
configuration problem
4. ASM instances are using different cluster_interconnects; HAIP being OFFLINE on one node causes the 2nd ASM instance to fail to start
Solutions:
1. Create a
temporary pfile to start ASM instance, then recreate spfile, see
Document 1095214.1 for more details.
2. Refer to Document
1077094.1 to correct the ASM discovery string.
3. Refer to Document
1050164.1 to fix ASMlib configuration.
4. Refer to Document
1383737.1 for solution. For more information about HAIP, please refer
to Document 1210883.1
============
Oracle Clusterware Cannot Start on all Nodes: Network communication with node <NAME> missing for 90% of timeout interval (Doc ID 1507482.1)
Applies to:
Oracle Database -
Enterprise Edition - Version 11.2.0.1 and later
Information in this
document applies to any platform.
Purpose
This note is a
troubleshooting guide for the following situation: Oracle
Clusterware cannot be started on all nodes at once. For example, in
a 2-node cluster, the Oracle Clusterware on the 2nd node won't start,
or, attempting to start clusterware on the second node causes the
first node's clusterware to shutdown.
In the clusterware
alert log ($GRID_HOME/log/<hostname>/alert<hostname>.log)
of one or more nodes where Oracle Clusterware is started, the
following messages are seen:
2012-07-14
19:24:18.420
[cssd(6192)]CRS-1612:Network
communication with node racnode02 (2) missing for 50% of timeout
interval. Removal of this node from cluster in 14.500 seconds
2012-07-14
19:24:25.422
[cssd(6192)]CRS-1611:Network
communication with node racnode02 (2) missing for 75% of timeout
interval. Removal of this node from cluster in 7.500 seconds
2012-07-14
19:24:30.424
[cssd(6192)]CRS-1610:Network
communication with node racnode02 (2) missing for 90% of timeout
interval. Removal of this node from cluster in 2.500 seconds
2012-07-14
19:24:32.925
[cssd(6192)]CRS-1607:Node
racnode02 is being evicted in cluster incarnation 179915229; details
at (:CSSNM00007:) in /u01/app/gridhome/log/racnode01/cssd/ocssd.log.
In the clusterware
alert log ($GRID_HOME/log/<hostname>/alert<hostname>.log)
of the evicted node(s), the following messages are seen:
2012-07-14
19:24:29.282
[cssd(8625)]CRS-1608:This
node was evicted by node 1, racnode01; details at (:CSSNM00005:) in
/u01/app/gridhome/log/racnode02/cssd/ocssd.log.
2012-07-14
19:24:29.282
[cssd(8625)]CRS-1656:The
CSS daemon is terminating due to a fatal error; Details at
(:CSSSC00012:) in /u01/app/gridhome/log/racnode02/cssd/ocssd.log
Troubleshooting
Steps
The Oracle
clusterware cannot be up on two (or more) nodes if those nodes cannot
communicate with each other over the interconnect.
The CRS-1612,
CRS-1611, CRS-1610 messages "Network communication with node
NAME(n) missing for PCT% of timeout interval" are warning that
ocssd on that node cannot communicate with ocssd on the other node(s)
over the interconnect. If this persists for the full timeout interval
(usually thirty seconds - reference: Document 294430.1) then Oracle
Clusteware is designed to evict one of the nodes.
Therefore, the issue that requires troubleshooting in such a case is why the nodes cannot communicate over the interconnect.
===== Clusterware does not start on ALL nodes after reboot (Doc ID 1676719.1) ========
Applies to:
Oracle Database -
Enterprise Edition - Version 11.2.0.1 to 11.2.0.4 [Release 11.2]
Information in this
document applies to any platform.
Symptoms
Clusterware fails to start on ALL nodes after the cluster nodes are rebooted. The nodes in the cluster are mdbp01, mdbp02, mdbp03 and mdbp04, with cluster name "crs".
$GRID_HOME/log/<node>/gpnpd/gpnpd.log reports the errors below:
2014-01-23
01:56:50.376: [ CLSXSEC][7]clsxsec_CtxCKVerify: [at clsxsec.c:1768]
Result: (10017) CLSXERR_SEC_BSAFE_DATA. Failed to verify: nzerr=29237
vstat=2
2014-01-23
01:56:50.376: [ GPNP][7]clsgpnpd_validateProfile: [at
clsgpnpd.c:2888] Result: (89) CLSGPNP_SIG_WALLETDIF. Profile failed
to verify. prf=6000000000d99e30
2014-01-23
01:56:50.376: [ GPNP][7]clsgpnpd_putProfileDo: [at
clsgpnpd.c:5336] Result: (89) CLSGPNP_SIG_WALLETDIF. PUT>>>
REFUSED best p=6000000000d99e30 from "tcp://rdbp08:10653"
2014-01-23
01:56:50.378: [ GPNP][7]clsgpnp_profileCallUrlInt: [at
clsgpnp.c:2104] put-profile call to url "tcp://rdbp06:50099"
disco
"mdns:service:gpnp._tcp.local.://rdbp06:50099/agent=gpnpd,cname=crs,host=rdbp06,pid=6049/gpnpd
h:rdbp06 c:crs" [f=0 claimed- host:mdbp01 cname:crs seq:6
auth:CN=GPnP_peer]
2014-01-23
01:56:52.111: [ OCRMSG][3]GIPC error [29] msg
[gipcretConnectionRefused]
2014-01-23
01:57:06.251: [ OCRMSG][3]GIPC error [29] msg
[gipcretConnectionRefused]
2014-01-23
01:57:27.462: [ OCRMSG][3]GIPC error [29] msg
[gipcretConnectionRefused]
$GRID_HOME/log/<node>/cssd/ocssd.log shows:
2014-01-23
02:37:21.387: [ CSSD][5]clssgmEvtInformation: reqtype (11) req
(6000000000c081f0)
2014-01-23
02:37:21.387: [ CSSD][5]clssnmQueueNotification: type (11)
6000000000c081f0
2014-01-23
02:37:22.639: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741]
Msg-reply has soap fault 10 (Operation returned Retry (error
CLSGPNP_CALL_AGAIN)) [uri
http://www.grid-pnp.org/2005/12/gpnp-errors#"]
2014-01-23
02:37:24.659: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741]
Msg-reply has soap fault 10 (Operation returned Retry (error
CLSGPNP_CALL_AGAIN)) [uri
http://www.grid-pnp.org/2005/12/gpnp-errors#"]
2014-01-23
02:37:26.679: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741]
Msg-reply has soap fault 10 (Operation returned Retry (error
CLSGPNP_CALL_AGAIN)) [uri
http://www.grid-pnp.org/2005/12/gpnp-errors#"]
...
...
2014-01-23
02:47:18.568: [ GPNP][1]clsgpnpm_newWiredMsg: [at clsgpnpm.c:741]
Msg-reply has soap fault 10 (Operation returned Retry (error
CLSGPNP_CALL_AGAIN)) [uri
http://www.grid-pnp.org/2005/12/gpnp-errors#"]
2014-01-23
02:47:19.377: [ CSSD][5]clssgmExecuteClientRequest: MAINT recvd
from proc 3 (6000000000c1bb00)
2014-01-23
02:47:19.377: [ CSSD][5]clssgmShutDown: Received abortive shutdown
request from client.
2014-01-23
02:47:19.377: [ CSSD][5]###################################
2014-01-23
02:47:19.377: [ CSSD][5]clssscExit: CSSD aborting from thread
GMClientListener
2014-01-23
02:47:19.377: [ CSSD][5]###################################
2014-01-23
02:47:19.377: [ CSSD][5](:CSSSC00012:)clssscExit: A fatal error
occurred and the CSS daemon is terminating abnormally
Cause
Another cluster is using the same cluster name "crs". This causes gpnpd from this cluster (containing nodes mdbp01, mdbp02, mdbp03 and mdbp04) to try to get the gpnp profile from the other cluster (containing nodes rdbp04, rdbp06, rdbp08 and rdbp09). As they do not belong to the same cluster, this leads to profile validation failure and Grid Infrastructure cannot start.
Solution
"Cluster name" should be unique across clusters. To fix the issue, the cluster name can be changed by following the steps below:
1. On all remote
nodes of the mdb* cluster (contains nodes mdbp01,mdbp02,mdbp03 and
mdbp04), as root user execute:
#
<$GRID_HOME>/crs/install/rootcrs.pl -deconfig -force -verbose
2. Once the above
command finishes on all remote nodes, on local node of mdb* cluster,
as root user execute:
#
<$GRID_HOME>/crs/install/rootcrs.pl -deconfig -force -verbose
-keepdg -lastnode
3. Reconfigure and
change the "Cluster name" by running
$GRID_HOME/crs/config/config.sh, refer to note 1354258.1 for details
4. Run root.sh as
prompted on each node to complete the configuration.
Note 1: from 12.1, the cluster GUID is used for cluster node discovery, hence the cluster name will not be an issue.
Note 2: the solution above also provides the steps for changing the cluster name in a RAC environment.
=========================
This document is
intended for Clusterware/RAC Database Administrators and Oracle
support engineers.
Details
Start up sequence:
In a nutshell, the
operating system starts ohasd, ohasd starts agents to start up
daemons (gipcd, mdnsd, gpnpd, ctssd, ocssd, crsd, evmd asm etc), and
crsd starts agents that start user resources (database, SCAN,
listener etc).
For detailed Grid
Infrastructure clusterware startup sequence, please refer to note
1053147.1
Cluster status
To find out the cluster and daemon status:
$GRID_HOME/bin/crsctl check crs
CRS-4638: Oracle High Availability Services is online
CRS-4537: Cluster Ready Services is online
CRS-4529: Cluster Synchronization Services is online
CRS-4533: Event Manager is online
$GRID_HOME/bin/crsctl stat res -t -init
--------------------------------------------------------------------------------
NAME               TARGET   STATE    SERVER     STATE_DETAILS
--------------------------------------------------------------------------------
Cluster Resources
--------------------------------------------------------------------------------
ora.asm
      1            ONLINE   ONLINE   rac1       Started
ora.crsd
      1            ONLINE   ONLINE   rac1
ora.cssd
      1            ONLINE   ONLINE   rac1
ora.cssdmonitor
      1            ONLINE   ONLINE   rac1
ora.ctssd
      1            ONLINE   ONLINE   rac1       OBSERVER
ora.diskmon
      1            ONLINE   ONLINE   rac1
ora.drivers.acfs
      1            ONLINE   ONLINE   rac1
ora.evmd
      1            ONLINE   ONLINE   rac1
ora.gipcd
      1            ONLINE   ONLINE   rac1
ora.gpnpd
      1            ONLINE   ONLINE   rac1
ora.mdnsd
      1            ONLINE   ONLINE   rac1
For 11.2.0.2 and above, there will be two more processes:
ora.cluster_interconnect.haip
      1            ONLINE   ONLINE   rac1
ora.crf
      1            ONLINE   ONLINE   rac1
For 11.2.0.3 onward in non-Exadata, ora.diskmon will be offline:
ora.diskmon
      1            OFFLINE  OFFLINE  rac1
For 12c onward, ora.storage is introduced:
ora.storage
      1            ONLINE   ONLINE   racnode1   STABLE
To start an offline daemon - if ora.crsd is OFFLINE:
$GRID_HOME/bin/crsctl start res ora.crsd -init
Case 1: OHASD does not start
As ohasd.bin is responsible for starting up all other clusterware processes directly or indirectly, it needs to start up properly for the rest of the stack to come up. If ohasd.bin is not up, CRS-4639 (Could not contact Oracle High Availability Services) will be reported when checking its status; if ohasd.bin is already up, CRS-4640 will be reported if another startup attempt is made; and if it fails to start, the following will be reported:
CRS-4124: Oracle High Availability Services startup failed.
CRS-4000: Command Start failed, or completed with errors.
Automatic ohasd.bin startup depends on the following:
1. OS is at the appropriate run level:
The OS needs to be at the specified run level before CRS will try to start up.
To find out at which run level the clusterware needs to come up:
cat /etc/inittab | grep init.ohasd
h1:35:respawn:/etc/init.d/init.ohasd run >/dev/null 2>&1 </dev/null
Note: Oracle Linux 6 (OL6) and Red Hat Linux 6 (RHEL6) have deprecated inittab; instead, init.ohasd is configured via upstart in /etc/init/oracle-ohasd.conf, however, the process "/etc/init.d/init.ohasd run" should still be up. Oracle Linux 7 (and Red Hat Linux 7) uses systemd to manage start/stop services (example: /etc/systemd/system/oracle-ohasd.service).
The example above shows CRS is supposed to run at run levels 3 and 5; please note that, depending on the platform, CRS comes up at a different run level.
To find out the current run level:
who -r
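Example (the exact output format varies by platform; this is only an illustration):
$ who -r
         run-level 3  Oct  2 12:47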
2. "init.ohasd
run" is up
On Linux/UNIX, as
"init.ohasd run" is configured in /etc/inittab, process
init (pid 1, /sbin/init on Linux, Solaris and hp-ux, /usr/sbin/init
on AIX) will start and respawn "init.ohasd run" if it
fails. Without "init.ohasd run" up and running, ohasd.bin
will not start:
ps -ef|grep
init.ohasd|grep -v grep
root 2279 1
0 18:14 ? 00:00:00 /bin/sh /etc/init.d/init.ohasd run
Note: Oracle Linux 6
(OL6) or Red Hat Linux 6 (RHEL6) has deprecated inittab, rather,
init.ohasd will be configured via upstart in
/etc/init/oracle-ohasd.conf, however, the process
""/etc/init.d/init.ohasd run" should still be up.
If any rc Snncommand
script (located in rcn.d, example S98gcstartup) stuck, init process
may not start "/etc/init.d/init.ohasd run"; please engage
OS vendor to find out why relevant Snncommand script stuck.
Error
"[ohasd(<pid>)] CRS-0715:Oracle High Availability Service
has timed out waiting for init.ohasd to be started." may be
reported of init.ohasd fails to start on time.
If SA can not
identify the reason why init.ohasd is not starting, the following can
be a very short term workaround:
cd
<location-of-init.ohasd>
nohup ./init.ohasd
run &
3. Clusterware auto start is enabled - it's enabled by default
By default CRS is
enabled for auto start upon node reboot, to enable:
$GRID_HOME/bin/crsctl
enable crs
To verify whether
it's currently enabled or not:
$GRID_HOME/bin/crsctl
config crs
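If autostart is enabled, the command typically returns a message like the following (illustrative output):
$GRID_HOME/bin/crsctl config crs
CRS-4622: Oracle High Availability Services autostart is enabled.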
If the following is
in OS messages file
Feb 29 16:20:36
racnode1 logger: Oracle Cluster Ready Services startup disabled.
Feb 29 16:20:36
racnode1 logger: Could not access
/var/opt/oracle/scls_scr/racnode1/root/ohasdstr
The reason is that the file does not exist or is not accessible; the cause can be that someone modified it manually, or that the wrong opatch was used to apply a GI patch (i.e. opatch for Solaris X64 used to apply a patch on Linux).
4. syslogd is up and the OS is able to execute the init script S96ohasd
The OS may get stuck on some other Snn script while the node is coming up, and thus never get the chance to execute S96ohasd; if that's the case, the following message will not be in the OS messages file:
Jan 20 20:46:51 rac1 logger: Oracle HA daemon is enabled for autostart.
If you don't see the above message, the other possibility is that syslogd (/usr/sbin/syslogd) is not fully up. Grid may fail to come up in that case as well. This may not apply to AIX.
To find out whether
OS is able to execute S96ohasd while node is coming up, modify
S96ohasd:
From:
case `$CAT
$AUTOSTARTFILE` in
enable*)
$LOGERR
"Oracle HA daemon is enabled for autostart."
To:
case `$CAT
$AUTOSTARTFILE` in
enable*)
/bin/touch
/tmp/ohasd.start."`date`"
$LOGERR
"Oracle HA daemon is enabled for autostart."
After a node reboot, if you don't see /tmp/ohasd.start.timestamp created, it means the OS is stuck on some other Snn script. If you do see /tmp/ohasd.start.timestamp but not "Oracle HA daemon is enabled for autostart" in messages, likely syslogd is not fully up. In both cases, you will need to engage the System Administrator to find the issue at the OS level. For the latter case, the workaround is to "sleep" for about 2 minutes; modify ohasd:
From:
case `$CAT
$AUTOSTARTFILE` in
enable*)
$LOGERR
"Oracle HA daemon is enabled for autostart."
To:
case `$CAT
$AUTOSTARTFILE` in
enable*)
/bin/sleep
120
$LOGERR
"Oracle HA daemon is enabled for autostart."
5. The file system where GRID_HOME resides is online when the init script S96ohasd is executed; once S96ohasd is executed, the following messages should be in the OS messages file:
Jan 20 20:46:51 rac1
logger: Oracle HA daemon is enabled for autostart.
..
Jan 20 20:46:57 rac1
logger: exec /ocw/grid/perl/bin/perl -I/ocw/grid/perl/lib
/ocw/grid/bin/crswrapexece.pl
/ocw/grid/crs/install/s_crsconfig_rac1_env.txt
/ocw/grid/bin/ohasd.bin "reboot"
If you see the first line but not the last line, most likely the filesystem containing the GRID_HOME was not online when S96ohasd was executed.
6. Oracle Local
Registry (OLR, $GRID_HOME/cdata/${HOSTNAME}.olr) is accessible and
valid
ls -l
$GRID_HOME/cdata/*.olr
-rw------- 1 root
oinstall 272756736 Feb 2 18:20 rac1.olr
If the OLR is
inaccessible or corrupted, likely ohasd.log will have similar
messages like following:
..
2010-01-24
22:59:10.470: [ default][1373676464] Initializing OLR
2010-01-24
22:59:10.472: [ OCROSD][1373676464]utopen:6m':failed in stat OCR
file/disk /ocw/grid/cdata/rac1.olr, errno=2, os err string=No such
file or directory
2010-01-24
22:59:10.472: [ OCROSD][1373676464]utopen:7:failed to open any OCR
file/disk, errno=2, os err string=No such file or directory
2010-01-24
22:59:10.473: [ OCRRAW][1373676464]proprinit: Could not open raw
device
2010-01-24
22:59:10.473: [ OCRAPI][1373676464]a_init:16!: Backend init
unsuccessful : [26]
2010-01-24
22:59:10.473: [ CRSOCR][1373676464] OCR context init failure.
Error: PROCL-26: Error while accessing the physical storage Operating
System error [No such file or directory] [2]
2010-01-24
22:59:10.473: [ default][1373676464] OLR initalization failured,
rc=26
2010-01-24
22:59:10.474: [ default][1373676464]Created alert : (:OHAS00106:) :
Failed to initialize Oracle Local Registry
2010-01-24
22:59:10.474: [ default][1373676464][PANIC] OHASD exiting; Could not
init OLR
OR
..
2010-01-24
23:01:46.275: [ OCROSD][1228334000]utread:3: Problem reading buffer
1907f000 buflen 4096 retval 0 phy_offset 102400 retry 5
2010-01-24
23:01:46.275: [ OCRRAW][1228334000]propriogid:1_1: Failed to read
the whole bootblock. Assumes invalid format.
2010-01-24
23:01:46.275: [ OCRRAW][1228334000]proprioini: all disks are not
OCR/OLR formatted
2010-01-24
23:01:46.275: [ OCRRAW][1228334000]proprinit: Could not open raw
device
2010-01-24
23:01:46.275: [ OCRAPI][1228334000]a_init:16!: Backend init
unsuccessful : [26]
2010-01-24
23:01:46.276: [ CRSOCR][1228334000] OCR context init failure.
Error: PROCL-26: Error while accessing the physical storage
2010-01-24
23:01:46.276: [ default][1228334000] OLR initalization failured,
rc=26
2010-01-24
23:01:46.276: [ default][1228334000]Created alert : (:OHAS00106:) :
Failed to initialize Oracle Local Registry
2010-01-24
23:01:46.277: [ default][1228334000][PANIC] OHASD exiting; Could not
init OLR
OR
..
2010-11-07
03:00:08.932: [ default][1] Created alert : (:OHAS00102:) : OHASD is
not running as privileged user
2010-11-07
03:00:08.932: [ default][1][PANIC] OHASD exiting: must be run as
privileged user
OR
ohasd.bin comes up but the output of "crsctl stat res -t -init" shows no resource, and "ocrconfig -local -manualbackup" fails
OR
..
2010-08-04
13:13:11.102: [ CRSPE][35] Resources parsed
2010-08-04
13:13:11.103: [ CRSPE][35] Server [] has been registered with the
PE data model
2010-08-04
13:13:11.103: [ CRSPE][35] STARTUPCMD_REQ = false:
2010-08-04
13:13:11.103: [ CRSPE][35] Server [] has changed state from
[Invalid/unitialized] to [VISIBLE]
2010-08-04
13:13:11.103: [ CRSOCR][31] Multi Write Batch processing...
2010-08-04
13:13:11.103: [ default][35] Dump State Starting ...
..
2010-08-04
13:13:11.112: [ CRSPE][35] SERVERS:
:VISIBLE:address{{Absolute|Node:0|Process:-1|Type:1}};
recovered state:VISIBLE. Assigned to no pool
------------- SERVER
POOLS:
Free
[min:0][max:-1][importance:0] NO SERVERS ASSIGNED
2010-08-04
13:13:11.113: [ CRSPE][35] Dumping ICE contents...:ICE operation
count: 0
2010-08-04
13:13:11.113: [ default][35] Dump State Done.
The solution is to
restore a good backup of OLR with "ocrconfig -local -restore
<ocr_backup_name>".
By default, OLR will
be backed up to $GRID_HOME/cdata/$HOST/backup_$TIME_STAMP.olr once
installation is complete.
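A hedged sketch of listing and restoring an OLR backup (run as root with the GI stack down on this node; the backup file name is an example only):
# $GRID_HOME/bin/ocrconfig -local -showbackup       # lists manual OLR backups; the install-time backup can be given by file name directly
# $GRID_HOME/bin/ocrconfig -local -restore $GRID_HOME/cdata/rac1/backup_20100202_182000.olr
# $GRID_HOME/bin/ocrcheck -local                    # verify the restored OLR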
7. ohasd.bin is able
to access network socket files:
2010-06-29
10:31:01.570: [ COMMCRS][1206901056]clsclisten: Permission denied for
(ADDRESS=(PROTOCOL=ipc)(KEY=procr_local_conn_0_PROL))
2010-06-29
10:31:01.571: [ OCRSRV][1217390912]th_listen: CLSCLISTEN failed
clsc_ret= 3, addr=
[(ADDRESS=(PROTOCOL=ipc)(KEY=procr_local_conn_0_PROL))]
2010-06-29
10:31:01.571: [ OCRSRV][3267002960]th_init: Local listener did not
reach valid state
In a Grid Infrastructure cluster environment, ohasd-related socket files should be owned by root, but in an Oracle Restart environment they should be owned by the grid user; refer to the "Network Socket File Location, Ownership and Permission" section for example output.
8. ohasd.bin is able
to access log file location:
OS messages/syslog
shows:
Feb 20 10:47:08
racnode1 OHASD[9566]: OHASD exiting; Directory
/ocw/grid/log/racnode1/ohasd not found.
Refer to "Log
File Location, Ownership and Permission" section for example
output, if the expected directory is missing, create it with proper
ownership and permission.
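For example, to recreate a missing ohasd log directory (a sketch assuming the 11.2-style layout and the example names used in this note; compare with a working node before applying):
# As root
mkdir -p /ocw/grid/log/racnode1/ohasd
chown root:oinstall /ocw/grid/log/racnode1/ohasd
chmod 750 /ocw/grid/log/racnode1/ohasd       # matches drwxr-x--- in the example listing below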
9. ohasd may fail to start on SUSE Linux after a node reboot; refer to note 1325718.1 - OHASD not Starting After Reboot on SLES
10. OHASD fails to start, "ps -ef | grep ohasd.bin" shows ohasd.bin is started, but nothing is written to $GRID_HOME/log/<node>/ohasd/ohasd.log for many minutes, and truss shows it is looping trying to close non-opened file handles:
..
15058/1:
0.1995 close(2147483646) Err#9 EBADF
15058/1:
0.1996 close(2147483645) Err#9 EBADF
..
Call stack of
ohasd.bin from pstack shows the following:
_close
sclssutl_closefiledescriptors main ..
The cause is bug 11834289, which is fixed in 11.2.0.3 and above. Another symptom of the bug is that clusterware processes may fail to start with the same call stack and truss output (looping on OS call "close"). If the bug is hit while trying to start other resources, "CRS-5802: Unable to start the agent process" could show up as well.
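To capture the same evidence on a hung ohasd.bin, a sketch (replace <pid> with the ohasd.bin process id from "ps -ef | grep ohasd.bin"; tool availability varies by platform):
# Solaris
truss -p <pid>
pstack <pid>
# Linux
strace -p <pid>
gstack <pid>        # from the gdb package, if installed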
11. Other potential
causes/solutions listed in note 1069182.1 - OHASD Failed to Start:
Inappropriate ioctl for device
12. ohasd.bin starts fine; however, "crsctl check crs" shows only the following and nothing else:
CRS-4638: Oracle
High Availability Services is online
And "crsctl
stat res -p -init" shows nothing
The cause is a corrupted OLR; refer to note 1193643.1 to restore it.
13. On EL7/OL7: note
1959008.1 - Install of Clusterware fails while running root.sh on OL7
- ohasd fails to start
14. For EL7/OL7,
patch 25606616 is needed: TRACKING BUG TO PROVIDE GI FIXES FOR OL7
15. If ohasd still
fails to start, refer to ohasd.log in
<grid-home>/log/<nodename>/ohasd/ohasd.log and
ohasdOUT.log
Case 2: OHASD Agents
do not start
OHASD.BIN will spawn four agents/monitors to start resources:
oraagent:
responsible for ora.asm, ora.evmd, ora.gipcd, ora.gpnpd, ora.mdnsd
etc
orarootagent:
responsible for ora.crsd, ora.ctssd, ora.diskmon, ora.drivers.acfs
etc
cssdagent /
cssdmonitor: responsible for ora.cssd(for ocssd.bin) and
ora.cssdmonitor(for cssdmonitor itself)
If ohasd.bin cannot start any of the above agents properly, the clusterware will not come to a healthy state.
1. A common cause of agent failure is that the log file or log directory for the agents does not have proper ownership or permissions.
Refer to below
section "Log File Location, Ownership and Permission" for
general reference.
One example is that "rootcrs.pl -patch/-postpatch" was not executed during manual patching, resulting in agent start failure:
2015-02-25
15:43:54.350806 : CRSMAIN:3294918400: {0:0:2} {0:0:2} Created alert :
(:CRSAGF00123:) : Failed to start the agent process:
/ocw/grid/bin/orarootagent Category: -1 Operation: fail Loc: canexec2
OS error: 0 Other : no exe permission, file
[/ocw/grid/bin/orarootagent]
2015-02-25
15:43:54.382154 : CRSMAIN:3294918400: {0:0:2} {0:0:2} Created alert :
(:CRSAGF00123:) : Failed to start the agent process:
/ocw/grid/bin/cssdagent Category: -1 Operation: fail Loc: canexec2 OS
error: 0 Other : no exe permission, file [/ocw/grid/bin/cssdagent]
2015-02-25
15:43:54.384105 : CRSMAIN:3294918400: {0:0:2} {0:0:2} Created alert :
(:CRSAGF00123:) : Failed to start the agent process:
/ocw/grid/bin/cssdmonitor Category: -1 Operation: fail Loc: canexec2
OS error: 0 Other : no exe permission, file
[/ocw/grid/bin/cssdmonitor]
The solution is to
execute the missed steps.
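To check the agent binaries for the missing execute permission shown above (GRID_HOME /ocw/grid is the example path used in this note), a quick sketch:
ls -l /ocw/grid/bin/oraagent /ocw/grid/bin/orarootagent \
      /ocw/grid/bin/cssdagent /ocw/grid/bin/cssdmonitor
# compare ownership and permissions with a working node of the same version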
2. If an agent binary (oraagent.bin or orarootagent.bin etc.) is corrupted, the agent will not start, resulting in related resources not coming up:
2011-05-03
11:11:13.189
[ohasd(25303)]CRS-5828:Could
not start agent '/ocw/grid/bin/orarootagent_grid'. Details at
(:CRSAGF00130:) {0:0:2} in /ocw/grid/log/racnode1/ohasd/ohasd.log.
2011-05-03
12:03:17.491: [ AGFW][1117866336] {0:0:184} Created alert :
(:CRSAGF00130:) : Failed to start the agent
/ocw/grid/bin/orarootagent_grid
2011-05-03
12:03:17.491: [ AGFW][1117866336] {0:0:184} Agfw Proxy Server
sending the last reply to PE for message:RESOURCE_START[ora.diskmon 1
1] ID 4098:403
2011-05-03
12:03:17.491: [ AGFW][1117866336] {0:0:184} Can not stop the
agent: /ocw/grid/bin/orarootagent_grid because pid is not initialized
..
2011-05-03
12:03:17.492: [ CRSPE][1128372576] {0:0:184} Fatal Error from AGFW
Proxy: Unable to start the agent process
2011-05-03
12:03:17.492: [ CRSPE][1128372576] {0:0:184} CRS-2674: Start of
'ora.diskmon' on 'racnode1' failed
..
2011-06-27
22:34:57.805: [ AGFW][1131669824] {0:0:2} Created alert :
(:CRSAGF00123:) : Failed to start the agent process:
/ocw/grid/bin/cssdagent Category: -1 Operation: fail Loc: canexec2 OS
error: 0 Other : no exe permission, file [/ocw/grid/bin/cssdagent]
2011-06-27
22:34:57.805: [ AGFW][1131669824] {0:0:2} Created alert :
(:CRSAGF00126:) : Agent start failed
..
2011-06-27
22:34:57.806: [ AGFW][1131669824] {0:0:2} Created alert :
(:CRSAGF00123:) : Failed to start the agent process:
/ocw/grid/bin/cssdmonitor Category: -1 Operation: fail Loc: canexec2
OS error: 0 Other : no exe permission, file
[/ocw/grid/bin/cssdmonitor]
The solution is to compare the agent binary with a "good" node and restore a good copy.
truss/strace of ohasd shows the agent binary is corrupted:
32555
17:38:15.953355 execve("/ocw/grid/bin/orarootagent.bin",
["/opt/grid/product/112020/grid/bi"...],
[/* 38 vars */]) = 0
..
32555
17:38:15.954151 --- SIGBUS (Bus error) @ 0 (0) ---
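A minimal way to compare the binaries across nodes is to run the same commands on the problem node and on a "good" node and compare the output (paths are the examples used in this note):
cksum /ocw/grid/bin/oraagent.bin /ocw/grid/bin/orarootagent.bin
ls -l /ocw/grid/bin/oraagent.bin /ocw/grid/bin/orarootagent.bin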
3. Agent may fail to
start due to bug 11834289 with error "CRS-5802: Unable to start
the agent process", refer to Section "OHASD does not start"
#10 for details.
4. Refer to: note
1964240.1 - CRS-5823:Could not initialize agent framework
Case 3: OCSSD.BIN
does not start
Successful ocssd.bin startup depends on the following:
1. GPnP profile is
accessible - gpnpd needs to be fully up to serve profile
If ocssd.bin is able to get the profile successfully, ocssd.log will likely have messages similar to the following:
2010-02-02
18:00:16.251: [ GPnP][408926240]clsgpnpm_exchange: [at
clsgpnpm.c:1175] Calling "ipc://GPNPD_rac1", try 4 of
500...
2010-02-02
18:00:16.263: [ GPnP][408926240]clsgpnp_profileVerifyForCall: [at
clsgpnp.c:1867] Result: (87) CLSGPNP_SIG_VALPEER. Profile verified.
prf=0x165160d0
2010-02-02
18:00:16.263: [ GPnP][408926240]clsgpnp_profileGetSequenceRef: [at
clsgpnp.c:841] Result: (0) CLSGPNP_OK. seq of p=0x165160d0 is '6'=6
2010-02-02
18:00:16.263: [ GPnP][408926240]clsgpnp_profileCallUrlInt: [at
clsgpnp.c:2186] Result: (0) CLSGPNP_OK. Successful get-profile CALL
to remote "ipc://GPNPD_rac1" disco ""
Otherwise, messages like the following will show in ocssd.log:
2010-02-03
22:26:17.057: [ GPnP][3852126240]clsgpnpm_connect: [at
clsgpnpm.c:1100] GIPC gipcretConnectionRefused (29)
gipcConnect(ipc-ipc://GPNPD_rac1)
2010-02-03
22:26:17.057: [ GPnP][3852126240]clsgpnpm_connect: [at
clsgpnpm.c:1101] Result: (48) CLSGPNP_COMM_ERR. Failed to connect to
call url "ipc://GPNPD_rac1"
2010-02-03
22:26:17.057: [ GPnP][3852126240]clsgpnp_getProfileEx: [at
clsgpnp.c:546] Result: (13) CLSGPNP_NO_DAEMON. Can't get GPnP service
profile from local GPnP daemon
2010-02-03
22:26:17.057: [ default][3852126240]Cannot get GPnP profile. Error
CLSGPNP_NO_DAEMON (GPNPD daemon is not running).
2010-02-03
22:26:17.057: [ CSSD][3852126240]clsgpnp_getProfile failed, rc(13)
The solution is to
ensure gpnpd is up and running properly.
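A quick sketch to confirm gpnpd is up and serving the profile (run as the grid user; output format varies by version):
ps -ef | grep gpnpd.bin | grep -v grep
$GRID_HOME/bin/crsctl stat res ora.gpnpd -init
$GRID_HOME/bin/gpnptool get          # should return the profile XML when gpnpd is healthy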
2. Voting Disk is
accessible
In 11gR2, ocssd.bin discovers voting disks using the settings in the GPnP profile; if not enough voting disks can be identified, ocssd.bin will abort itself.
2010-02-03
22:37:22.212: [ CSSD][2330355744]clssnmReadDiscoveryProfile:
voting file discovery string(/share/storage/di*)
..
2010-02-03
22:37:22.227: [ CSSD][1145538880]clssnmvDiskVerify: Successful
discovery of 0 disks
2010-02-03
22:37:22.227: [ CSSD][1145538880]clssnmCompleteInitVFDiscovery:
Completing initial voting file discovery
2010-02-03
22:37:22.227: [ CSSD][1145538880]clssnmvFindInitialConfigs: No
voting files found
2010-02-03
22:37:22.228: [
CSSD][1145538880]###################################
2010-02-03
22:37:22.228: [ CSSD][1145538880]clssscExit: CSSD signal 11 in
thread clssnmvDDiscThread
ocssd.bin may fail to come up with the following error if all nodes failed while a voting file change was in progress:
2010-05-02
03:11:19.033: [ CSSD][1197668093]clssnmCompleteInitVFDiscovery:
Detected voting file add in progress for CIN 0:1134513465:0, waiting
for configuration to complete 0:1134513098:0
The solution is to start ocssd.bin in exclusive mode per note 1364971.1.
If the voting disk
is located on a non-ASM device, ownership and permissions should be:
-rw-r----- 1 ogrid
oinstall 21004288 Feb 4 09:13 votedisk1
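Once ocssd.bin can be brought up (for example in exclusive mode per note 1364971.1), the voting files and their states can be listed as the grid user; a sketch:
$GRID_HOME/bin/crsctl query css votedisk
# for a non-ASM voting device, also confirm ownership/permissions as shown above:
ls -l /path/to/votedisk1             # adjust to the actual device path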
3. Network is
functional and name resolution is working:
If ocssd.bin cannot bind to any network, ocssd.log will likely have messages like the following:
2010-02-03
23:26:25.804: [GIPCXCPT][1206540320]gipcmodGipcPassInitializeNetwork:
failed to find any interfaces in clsinet, ret gipcretFail (1)
2010-02-03
23:26:25.804: [GIPCGMOD][1206540320]gipcmodGipcPassInitializeNetwork:
EXCEPTION[ ret gipcretFail (1) ] failed to determine host from
clsinet, using default
..
2010-02-03
23:26:25.810: [ CSSD][1206540320]clsssclsnrsetup: gipcEndpoint
failed, rc 39
2010-02-03
23:26:25.811: [ CSSD][1206540320]clssnmOpenGIPCEndp: failed to
listen on gipc addr gipc://rac1:nm_eotcs- ret 39
2010-02-03
23:26:25.811: [ CSSD][1206540320]clssscmain: failed to open gipc
endp
If there is a connectivity issue on the private network (including multicast being off), ocssd.log will likely have messages like the following:
2010-09-20
11:52:54.014: [ CSSD][1103055168]clssnmvDHBValidateNCopy: node 1,
racnode1, has a disk HB, but no network HB, DHB has rcfg 180441784,
wrtcnt, 453, LATS 328297844, lastSeqNo 452, uniqueness 1284979488,
timestamp 1284979973/329344894
2010-09-20
11:52:54.016: [ CSSD][1078421824]clssgmWaitOnEventValue: after
CmInfo State val 3, eval 1 waited 0
.. >>>>
after a long delay
2010-09-20
12:02:39.578: [ CSSD][1103055168]clssnmvDHBValidateNCopy: node 1,
racnode1, has a disk HB, but no network HB, DHB has rcfg 180441784,
wrtcnt, 1037, LATS 328883434, lastSeqNo 1036, uniqueness 1284979488,
timestamp 1284980558/329930254
2010-09-20
12:02:39.895: [ CSSD][1107286336]clssgmExecuteClientRequest: MAINT
recvd from proc 2 (0xe1ad870)
2010-09-20
12:02:39.895: [ CSSD][1107286336]clssgmShutDown: Received abortive
shutdown request from client.
2010-09-20
12:02:39.895: [
CSSD][1107286336]###################################
2010-09-20
12:02:39.895: [ CSSD][1107286336]clssscExit: CSSD aborting from
thread GMClientListener
2010-09-20
12:02:39.895: [
CSSD][1107286336]###################################
To validate the network, please refer to note 1054902.1.
If CSSD could not start after a network change, please also check whether the network interface name matches the GPnP profile definition ("gpnptool get") for cluster_interconnect.
In 11.2.0.1, ocssd.bin may bind to the public network if the private network is unavailable.
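A small sketch to cross-check the OS interfaces against the GPnP profile (run as the grid user; "ip addr" assumes Linux, use ifconfig on other platforms):
ip addr                                   # OS view of interface names and subnets
$GRID_HOME/bin/gpnptool get               # look for the cluster_interconnect interface definition
$GRID_HOME/bin/oifcfg getif               # may fail while the stack is down; works once CRS is up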
4. Vendor
clusterware is up (if using vendor clusterware)
Grid Infrastructure provides full clusterware functionality and does not need vendor clusterware to be installed; but if you happen to have Grid Infrastructure on top of vendor clusterware in your environment, the vendor clusterware needs to come up fully before CRS can be started. To verify, as the grid user:
$GRID_HOME/bin/lsnodes
-n
racnode1 1
racnode1 0
If the vendor clusterware is not fully up, ocssd.log will likely have messages similar to the following:
2010-08-30
18:28:13.207: [ CSSD][36]clssnm_skgxninit: skgxncin failed, will
retry
2010-08-30
18:28:14.207: [ CSSD][36]clssnm_skgxnmon: skgxn init failed
2010-08-30
18:28:14.208: [ CSSD][36]###################################
2010-08-30
18:28:14.208: [ CSSD][36]clssscExit: CSSD signal 11 in thread
skgxnmon
Before the
clusterware is installed, execute the command below as grid user:
$INSTALL_SOURCE/install/lsnodes
-v
One issue on hp-ux:
note 2130230.1 - Grid infrastructure startup fails due to vendor
Clusterware did not start (HP-UX Service guard)
5. Command "crsctl"
being executed from wrong GRID_HOME
Command "crsctl"
must be executed from correct GRID_HOME to start the stack, or
similar message will be reported:
2012-11-14
10:21:44.014: [ CSSD][1086675264]ASSERT clssnm1.c 3248
2012-11-14
10:21:44.014: [
CSSD][1086675264](:CSSNM00056:)clssnmvStartDiscovery: Terminating
because of the release version(11.2.0.2.0) of this node being lesser
than the active version(11.2.0.3.0) that the cluster is at
2012-11-14
10:21:44.014: [
CSSD][1086675264]###################################
2012-11-14
10:21:44.014: [ CSSD][1086675264]clssscExit: CSSD aborting from
thread clssnmvDDiscThread#
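On Linux, the Grid home that the node is actually configured with can be confirmed before running crsctl; a sketch:
# As root
cat /etc/oracle/olr.loc        # crs_home (when present) should point to the correct GRID_HOME
which crsctl                   # make sure it resolves under that same GRID_HOME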
Case 4: CRSD.BIN
does not start
If the "crsctl
stat res -t -init" shows that ora.crsd is in intermediate state
and if this is not the first node where crsd is starting, then a
likely cause is that the csrd.bin is not able to talk to the master
crsd.bin.
In this case, the
master crsd.bin is likely having a problem, so killing the master
crsd.bin is a likely solution.
Issue "grep
MASTER crsd.trc" to find out the node where the master crsd.bin
is running. Kill the crsd.bin on that master node.
The crsd.bin will
automatically respawn although the master will be transferred to
crsd.bin on another node.
Successful crsd.bin
startup depends on the following:
1. ocssd is fully up
If ocssd.bin is not fully up, crsd.log will show messages like the following:
2010-02-03
22:37:51.638: [ CSSCLNT][1548456880]clssscConnect: gipc request
failed with 29 (0x16)
2010-02-03
22:37:51.638: [ CSSCLNT][1548456880]clsssInitNative: connect failed,
rc 29
2010-02-03
22:37:51.639: [ CRSRTI][1548456880] CSS is not ready. Received
status 3 from CSS. Waiting for good status ..
2. OCR is accessible
If the OCR is located on ASM, the ora.asm resource (ASM instance) must be up and the diskgroup for the OCR must be mounted; if not, crsd.log will likely show messages like:
2010-02-03
22:22:55.186: [ OCRASM][2603807664]proprasmo: Error in open/create
file in dg [GI]
[
OCRASM][2603807664]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=15077,
loc=kgfokge
ORA-15077: could not
locate ASM instance serving a required diskgroup
2010-02-03
22:22:55.189: [ OCRASM][2603807664]proprasmo: kgfoCheckMount
returned [7]
2010-02-03
22:22:55.189: [ OCRASM][2603807664]proprasmo: The ASM instance is
down
2010-02-03
22:22:55.190: [ OCRRAW][2603807664]proprioo: Failed to open [+GI].
Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2010-02-03
22:22:55.190: [ OCRRAW][2603807664]proprioo: No OCR/OLR devices are
usable
2010-02-03
22:22:55.190: [ OCRASM][2603807664]proprasmcl: asmhandle is NULL
2010-02-03
22:22:55.190: [ OCRRAW][2603807664]proprinit: Could not open raw
device
2010-02-03
22:22:55.190: [ OCRASM][2603807664]proprasmcl: asmhandle is NULL
2010-02-03
22:22:55.190: [ OCRAPI][2603807664]a_init:16!: Backend init
unsuccessful : [26]
2010-02-03
22:22:55.190: [ CRSOCR][2603807664] OCR context init failure.
Error: PROC-26: Error while accessing the physical storage ASM error
[SLOS: cat=7, opn=kgfoAl06, dep=15077, loc=kgfokge
ORA-15077: could not
locate ASM instance serving a required diskgroup
] [7]
2010-02-03
22:22:55.190: [ CRSD][2603807664][PANIC] CRSD exiting: Could not
init OCR, code: 26
Note: in 11.2 ASM
starts before crsd.bin, and brings up the diskgroup automatically if
it contains the OCR.
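A quick sketch to confirm ASM is up and the OCR diskgroup is mounted (/etc/oracle/ocr.loc assumes Linux; ocrcheck should be run as root):
ps -ef | grep asm_pmon | grep -v grep        # is the ASM instance running?
cat /etc/oracle/ocr.loc                      # where the OCR is expected, e.g. +GI
$GRID_HOME/bin/crsctl stat res ora.asm -init
$GRID_HOME/bin/ocrcheck                      # verifies the OCR is accessible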
If the OCR is
located on a non-ASM device, expected ownership and permissions are:
-rw-r----- 1 root
oinstall 272756736 Feb 3 23:24 ocr
If the OCR is located on a non-ASM device and it is unavailable, crsd.log will likely show messages similar to the following:
2010-02-03
23:14:33.583: [ OCROSD][2346668976]utopen:7:failed to open any OCR
file/disk, errno=2, os err string=No such file or directory
2010-02-03
23:14:33.583: [ OCRRAW][2346668976]proprinit: Could not open raw
device
2010-02-03
23:14:33.583: [ default][2346668976]a_init:7!: Backend init
unsuccessful : [26]
2010-02-03
23:14:34.587: [ OCROSD][2346668976]utopen:6m':failed in stat OCR
file/disk /share/storage/ocr, errno=2, os err string=No such file or
directory
2010-02-03
23:14:34.587: [ OCROSD][2346668976]utopen:7:failed to open any OCR
file/disk, errno=2, os err string=No such file or directory
2010-02-03
23:14:34.587: [ OCRRAW][2346668976]proprinit: Could not open raw
device
2010-02-03
23:14:34.587: [ default][2346668976]a_init:7!: Backend init
unsuccessful : [26]
2010-02-03
23:14:35.589: [ CRSD][2346668976][PANIC] CRSD exiting: OCR device
cannot be initialized, error: 1:26
If the OCR is corrupted, crsd.log will likely show messages like the following:
2010-02-03
23:19:38.417: [ default][3360863152]a_init:7!: Backend init
unsuccessful : [26]
2010-02-03
23:19:39.429: [ OCRRAW][3360863152]propriogid:1_2: INVALID FORMAT
2010-02-03
23:19:39.429: [ OCRRAW][3360863152]proprioini: all disks are not
OCR/OLR formatted
2010-02-03
23:19:39.429: [ OCRRAW][3360863152]proprinit: Could not open raw
device
2010-02-03
23:19:39.429: [ default][3360863152]a_init:7!: Backend init
unsuccessful : [26]
2010-02-03
23:19:40.432: [ CRSD][3360863152][PANIC] CRSD exiting: OCR device
cannot be initialized, error: 1:26
If the owner or group of the grid user has been changed, then even if ASM is available, crsd.log will likely show the following:
2010-03-10
11:45:12.510: [ OCRASM][611467760]proprasmo: Error in open/create
file in dg [SYSTEMDG]
[
OCRASM][611467760]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=1031,
loc=kgfokge
ORA-01031:
insufficient privileges
2010-03-10
11:45:12.528: [ OCRASM][611467760]proprasmo: kgfoCheckMount returned
[7]
2010-03-10
11:45:12.529: [ OCRASM][611467760]proprasmo: The ASM instance is
down
2010-03-10
11:45:12.529: [ OCRRAW][611467760]proprioo: Failed to open
[+SYSTEMDG]. Returned proprasmo() with [26]. Marking location as
UNAVAILABLE.
2010-03-10
11:45:12.529: [ OCRRAW][611467760]proprioo: No OCR/OLR devices are
usable
2010-03-10
11:45:12.529: [ OCRASM][611467760]proprasmcl: asmhandle is NULL
2010-03-10
11:45:12.529: [ OCRRAW][611467760]proprinit: Could not open raw
device
2010-03-10
11:45:12.529: [ OCRASM][611467760]proprasmcl: asmhandle is NULL
2010-03-10
11:45:12.529: [ OCRAPI][611467760]a_init:16!: Backend init
unsuccessful : [26]
2010-03-10
11:45:12.530: [ CRSOCR][611467760] OCR context init failure. Error:
PROC-26: Error while accessing the physical storage ASM error [SLOS:
cat=7, opn=kgfoAl06, dep=1031, loc=kgfokge
ORA-01031:
insufficient privileges
] [7]
If the oracle binary in GRID_HOME has wrong ownership or permissions (regardless of whether ASM is up and running), or if the grid user cannot write in ORACLE_BASE, crsd.log will likely show the following:
2012-03-04
21:34:23.139: [ OCRASM][3301265904]proprasmo: Error in open/create
file in dg [OCR]
[
OCRASM][3301265904]SLOS : SLOS: cat=7, opn=kgfoAl06, dep=12547,
loc=kgfokge
2012-03-04
21:34:23.139: [ OCRASM][3301265904]ASM Error Stack : ORA-12547:
TNS:lost contact
2012-03-04
21:34:23.633: [ OCRASM][3301265904]proprasmo: kgfoCheckMount
returned [7]
2012-03-04
21:34:23.633: [ OCRASM][3301265904]proprasmo: The ASM instance is
down
2012-03-04
21:34:23.634: [ OCRRAW][3301265904]proprioo: Failed to open [+OCR].
Returned proprasmo() with [26]. Marking location as UNAVAILABLE.
2012-03-04
21:34:23.634: [ OCRRAW][3301265904]proprioo: No OCR/OLR devices are
usable
2012-03-04
21:34:23.635: [ OCRASM][3301265904]proprasmcl: asmhandle is NULL
2012-03-04
21:34:23.636: [ GIPC][3301265904] gipcCheckInitialization:
possible incompatible non-threaded init from [prom.c : 690], original
from [clsss.c : 5326]
2012-03-04
21:34:23.639: [ default][3301265904]clsvactversion:4: Retrieving
Active Version from local storage.
2012-03-04
21:34:23.643: [ OCRRAW][3301265904]proprrepauto: The local OCR
configuration matches with the configuration published by OCR Cache
Writer. No repair required.
2012-03-04
21:34:23.645: [ OCRRAW][3301265904]proprinit: Could not open raw
device
2012-03-04
21:34:23.646: [ OCRASM][3301265904]proprasmcl: asmhandle is NULL
2012-03-04
21:34:23.650: [ OCRAPI][3301265904]a_init:16!: Backend init
unsuccessful : [26]
2012-03-04
21:34:23.651: [ CRSOCR][3301265904] OCR context init failure.
Error: PROC-26: Error while accessing the physical storage
ORA-12547: TNS:lost
contact
2012-03-04
21:34:23.652: [ CRSMAIN][3301265904] Created alert : (:CRSD00111:) :
Could not init OCR, error: PROC-26: Error while accessing the
physical storage
ORA-12547: TNS:lost
contact
2012-03-04
21:34:23.652: [ CRSD][3301265904][PANIC] CRSD exiting: Could not
init OCR, code: 26
The expected ownership and permissions of the oracle binary in GRID_HOME are:
-rwsr-s--x 1 grid
oinstall 184431149 Feb 2 20:37 /ocw/grid/bin/oracle
If the OCR or its mirror is unavailable (for example, ASM is up but the diskgroup for the OCR/mirror is unmounted), crsd.log will likely show the following:
2010-05-11
11:16:38.578: [ OCRASM][18]proprasmo: Error in open/create file in
dg [OCRMIR]
[ OCRASM][18]SLOS :
SLOS: cat=8, opn=kgfoOpenFile01, dep=15056, loc=kgfokge
ORA-17503:
ksfdopn:DGOpenFile05 Failed to open file +OCRMIR.255.4294967295
ORA-17503: ksfdopn:2
Failed to open file +OCRMIR.255.4294967295
ORA-15001: diskgroup
"OCRMIR
..
2010-05-11
11:16:38.647: [ OCRASM][18]proprasmo: kgfoCheckMount returned [6]
2010-05-11
11:16:38.648: [ OCRASM][18]proprasmo: The ASM disk group OCRMIR is
not found or not mounted
2010-05-11
11:16:38.648: [ OCRASM][18]proprasmdvch: Failed to open OCR location
[+OCRMIR] error [26]
2010-05-11
11:16:38.648: [ OCRRAW][18]propriodvch: Error [8] returned device
check for [+OCRMIR]
2010-05-11
11:16:38.648: [ OCRRAW][18]dev_replace: non-master could not verify
the new disk (8)
[
OCRSRV][18]proath_invalidate_action: Failed to replace [+OCRMIR] [8]
[
OCRAPI][18]procr_ctx_set_invalid_no_abort: ctx set to invalid
..
2010-05-11
11:16:46.587: [ OCRMAS][19]th_master:91: Comparing device hash ids
between local and master failed
2010-05-11
11:16:46.587: [ OCRMAS][19]th_master:91 Local dev (1862408427,
1028247821, 0, 0, 0)
2010-05-11
11:16:46.587: [ OCRMAS][19]th_master:91 Master dev (1862408427,
1859478705, 0, 0, 0)
2010-05-11
11:16:46.587: [ OCRMAS][19]th_master:9: Shutdown CacheLocal. my hash
ids don't match
[
OCRAPI][19]procr_ctx_set_invalid_no_abort: ctx set to invalid
[
OCRAPI][19]procr_ctx_set_invalid: aborting...
2010-05-11
11:16:46.587: [ CRSD][19] Dump State Starting ...
3. crsd.bin pid file
exists and points to running crsd.bin process
If the pid file does not exist, $GRID_HOME/log/$HOST/agent/ohasd/orarootagent_root/orarootagent_root.log will have messages similar to the following:
2010-02-14
17:40:57.927: [ora.crsd][1243486528] [check] PID FILE doesn't exist.
..
2010-02-14
17:41:57.927: [ clsdmt][1092499776]Creating PID [30269] file for
home /ocw/grid host racnode1 bin crs to /ocw/grid/crs/init/
2010-02-14
17:41:57.927: [ clsdmt][1092499776]Error3 -2 writing PID [30269] to
the file []
2010-02-14
17:41:57.927: [ clsdmt][1092499776]Failed to record pid for CRSD
2010-02-14
17:41:57.927: [ clsdmt][1092499776]Terminating process
2010-02-14
17:41:57.927: [ default][1092499776] CRSD exiting on stop request
from clsdms_thdmai
The solution is to create a dummy pid file ($GRID_HOME/crs/init/$HOST.pid) manually as the grid user with the "touch" command and restart resource ora.crsd.
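A minimal sketch using the example node name racnode1:
As grid user:
$ touch $GRID_HOME/crs/init/racnode1.pid
As root:
# $GRID_HOME/bin/crsctl stop res ora.crsd -init
# $GRID_HOME/bin/crsctl start res ora.crsd -init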
If the pid file does exist and the PID in this file references a running process which is NOT the crsd.bin process, $GRID_HOME/log/$HOST/agent/ohasd/orarootagent_root/orarootagent_root.log will have messages similar to the following:
2011-04-06
15:53:38.777: [ora.crsd][1160390976] [check] PID will be looked for
in /ocw/grid/crs/init/racnode1.pid
2011-04-06
15:53:38.778: [ora.crsd][1160390976] [check] PID which will be
monitored will be 1535 >> 1535 is
output of "cat /ocw/grid/crs/init/racnode1.pid"
2011-04-06
15:53:38.965: [ COMMCRS][1191860544]clsc_connect: (0x2aaab400b0b0) no
listener at (ADDRESS=(PROTOCOL=ipc)(KEY=racnode1DBG_CRSD))
[
clsdmc][1160390976]Fail to connect
(ADDRESS=(PROTOCOL=ipc)(KEY=racnode1DBG_CRSD)) with status 9
2011-04-06
15:53:38.966: [ora.crsd][1160390976] [check] Error = error 9
encountered when connecting to CRSD
2011-04-06
15:53:39.023: [ora.crsd][1160390976] [check] Calling PID check for
daemon
2011-04-06
15:53:39.023: [ora.crsd][1160390976] [check] Trying to check PID =
1535
2011-04-06
15:53:39.203: [ora.crsd][1160390976] [check] PID check returned
ONLINE CLSDM returned OFFLINE
2011-04-06
15:53:39.203: [ora.crsd][1160390976] [check] DaemonAgent::check
returned 5
2011-04-06
15:53:39.203: [ AGFW][1160390976] check for resource: ora.crsd 1 1
completed with status: FAILED
2011-04-06
15:53:39.203: [ AGFW][1170880832] ora.crsd 1 1 state changed from:
UNKNOWN to: FAILED
..
2011-04-06
15:54:10.511: [ AGFW][1167522112] ora.crsd 1 1 state changed from:
UNKNOWN to: CLEANING
..
2011-04-06
15:54:10.513: [ora.crsd][1146542400] [clean] Trying to stop PID =
1535
..
2011-04-06
15:54:11.514: [ora.crsd][1146542400] [clean] Trying to check PID =
1535
To verify on OS
level:
ls -l
/ocw/grid/crs/init/*pid
-rwxr-xr-x 1 ogrid
oinstall 5 Feb 17 11:00 /ocw/grid/crs/init/racnode1.pid
cat
/ocw/grid/crs/init/*pid
1535
ps -ef| grep 1535
root 1535 1
0 Mar30 ? 00:00:00 iscsid >> Note
process 1535 is not crsd.bin
The solution is to
create an empty pid file and to restart the resource ora.crsd, as
root:
# >
$GRID_HOME/crs/init/<racnode1>.pid
#
$GRID_HOME/bin/crsctl stop res ora.crsd -init
#
$GRID_HOME/bin/crsctl start res ora.crsd -init
4. Network is
functional and name resolution is working:
If the network is not fully functioning, ocssd.bin may still come up, but crsd.bin may fail, and crsd.log will show messages like:
2010-02-03
23:34:28.412: [ GPnP][2235814832]clsgpnp_Init: [at clsgpnp0.c:837]
GPnP client pid=867, tl=3, f=0
2010-02-03
23:34:28.428: [ OCRAPI][2235814832]clsu_get_private_ip_addresses: no
ip addresses found.
..
2010-02-03
23:34:28.434: [ OCRAPI][2235814832]a_init:13!: Clusterware init
unsuccessful : [44]
2010-02-03
23:34:28.434: [ CRSOCR][2235814832] OCR context init failure.
Error: PROC-44: Error in network address and interface operations
Network address and interface operations error [7]
2010-02-03
23:34:28.434: [ CRSD][2235814832][PANIC] CRSD exiting: Could not
init OCR, code: 44
Or:
2009-12-10
06:28:31.974: [ OCRMAS][20]proath_connect_master:1: could not
connect to master clsc_ret1 = 9, clsc_ret2 = 9
2009-12-10
06:28:31.974: [ OCRMAS][20]th_master:11: Could not connect to the
new master
2009-12-10
06:29:01.450: [ CRSMAIN][2] Policy Engine is not initialized yet!
2009-12-10
06:29:31.489: [ CRSMAIN][2] Policy Engine is not initialized yet!
Or:
2009-12-31
00:42:08.110: [ COMMCRS][10]clsc_receive: (102b03250) Error
receiving, ns (12535, 12560), transport (505, 145, 0)
To validate the
network, please refer to note 1054902.1
5. The crsd executables (crsd.bin and crsd in GRID_HOME/bin) have correct ownership/permissions and have not been manually modified; a simple way to check is to compare the output of "ls -l <grid-home>/bin/crsd <grid-home>/bin/crsd.bin" with a "good" node.
6. crsd may not
start due to the following:
note 1552472.1 -CRSD
Will Not Start Following a Node Reboot: crsd.log reports: clsclisten:
op 65 failed and/or Unable to get E2E port
note 1684332.1 - GI
crsd Fails to Start: clsclisten: op 65 failed, NSerr (12560, 0),
transport: (583, 0, 0)
7. To troubleshoot
further, refer to note 1323698.1 - Troubleshooting CRSD Start up
Issue
Case 5: GPNPD.BIN
does not start
1. Name Resolution
is not working
gpnpd.bin fails with the following error in gpnpd.log:
2010-05-13
12:48:11.540: [ GPnP][1171126592]clsgpnpm_exchange: [at
clsgpnpm.c:1175] Calling "tcp://node2:9393", try 1 of 3...
2010-05-13
12:48:11.540: [ GPnP][1171126592]clsgpnpm_connect: [at
clsgpnpm.c:1015] ENTRY
2010-05-13
12:48:11.541: [ GPnP][1171126592]clsgpnpm_connect: [at
clsgpnpm.c:1066] GIPC gipcretFail (1)
gipcConnect(tcp-tcp://node2:9393)
2010-05-13
12:48:11.541: [ GPnP][1171126592]clsgpnpm_connect: [at
clsgpnpm.c:1067] Result: (48) CLSGPNP_COMM_ERR. Failed to connect to
call url "tcp://node2:9393"
In the above example, please make sure the current node is able to ping "node2" and that there is no firewall between them.
2. Bug 10105195
Due to Bug 10105195, gpnpd dispatch is single-threaded and can be blocked by network scanning etc. The bug is fixed in 11.2.0.2 GI PSU2, 11.2.0.3 and above; refer to note 10105195.8 for more details.
Case 6: Various
other daemons do not start
Common causes:
1. Log file or
directory for the daemon doesn't have appropriate ownership or
permission
If the log file or log directory for the daemon doesn't have proper ownership or permissions, usually there is no new information in the log file and its timestamp remains the same while the daemon tries to come up.
Refer to below
section "Log File Location, Ownership and Permission" for
general reference.
2. Network socket
file doesn't have appropriate ownership or permission
In this case, the
daemon log will show messages like:
2010-02-02
12:55:20.485: [ COMMCRS][1121433920]clsclisten: Permission denied for
(ADDRESS=(PROTOCOL=ipc)(KEY=rac1DBG_GIPCD))
2010-02-02
12:55:20.485: [ clsdmt][1110944064]Fail to listen to
(ADDRESS=(PROTOCOL=ipc)(KEY=rac1DBG_GIPCD))
3. OLR is corrupted
In this case, the daemon log will show messages like the following (this is a case where ora.ctssd fails to start):
2012-07-22
00:15:16.565: [ default][1]clsvactversion:4: Retrieving Active
Version from local storage.
2012-07-22
00:15:16.575: [ CTSS][1]clsctss_r_av3: Invalid active version []
retrieved from OLR. Returns [19].
2012-07-22
00:15:16.585: [ CTSS][1](:ctss_init16:): Error [19] retrieving
active version. Returns [19].
2012-07-22
00:15:16.585: [ CTSS][1]ctss_main: CTSS init failed [19]
2012-07-22
00:15:16.585: [ CTSS][1]ctss_main: CTSS daemon aborting [19].
2012-07-22
00:15:16.585: [ CTSS][1]CTSS daemon aborting
The solution is to restore a good copy of the OLR per note 1193643.1.
4. Other cases:
note 1087521.1 -
CTSS Daemon Aborting With "op 65 failed, NSerr (12560, 0),
transport: (583, 0, 0)"
Case 7: CRSD Agents
do not start
CRSD.BIN will spawn two agents to start up user resources - the two agents share the same names and binaries as the ohasd.bin agents:
orarootagent:
responsible for ora.netn.network, ora.nodename.vip, ora.scann.vip and
ora.gns
oraagent:
responsible for ora.asm, ora.eons, ora.ons, listener, SCAN listener,
diskgroup, database, service resource etc
To find out the user resource status:
$GRID_HOME/bin/crsctl stat res -t
If crsd.bin cannot start any of the above agents properly, user resources may not come up.
1. A common cause of agent failure is that the log file or log directory for the agents does not have proper ownership or permissions.
Refer to below
section "Log File Location, Ownership and Permission" for
general reference.
2. Agent may fail to
start due to bug 11834289 with error "CRS-5802: Unable to start
the agent process", refer to Section "OHASD does not start"
#10 for details.
Case 8: HAIP does
not start
HAIP may fail to start with various errors, for example:
[ohasd(891)]CRS-2807:Resource
'ora.cluster_interconnect.haip' failed to start automatically.
Refer to note
1210883.1 for more details of HAIP
Network and Naming
Resolution Verification
CRS depends on a
fully functional network and name resolution. If the network or name
resolution is not fully functioning, CRS may not come up
successfully.
To validate network
and name resolution setup, please refer to note 1054902.1
Log File Location,
Ownership and Permission
Appropriate
ownership and permission of sub-directories and files in
$GRID_HOME/log is critical for CRS components to come up properly.
In Grid
Infrastructure cluster environment:
Assuming a Grid Infrastructure environment with node name rac1, CRS owner grid, and two separate RDBMS owners rdbmsap and rdbmsar, here is what it looks like under $GRID_HOME/log in a cluster environment:
drwxrwxr-x 5
grid oinstall 4096 Dec 6 09:20 log
drwxr-xr-x 2
grid oinstall 4096 Dec 6 08:36 crs
drwxr-xr-t 17
root oinstall 4096 Dec 6 09:22 rac1
drwxr-x--- 2
grid oinstall 4096 Dec 6 09:20 admin
drwxrwxr-t 4
root oinstall 4096 Dec 6 09:20 agent
drwxrwxrwt
7 root oinstall 4096 Jan 26 18:15 crsd
drwxr-xr-t 2 grid oinstall 4096 Dec 6 09:40 application_grid
drwxr-xr-t 2 grid oinstall 4096 Jan 26 18:15 oraagent_grid
drwxr-xr-t 2 rdbmsap oinstall 4096 Jan 26 18:15 oraagent_rdbmsap
drwxr-xr-t 2 rdbmsar oinstall 4096 Jan 26 18:15 oraagent_rdbmsar
drwxr-xr-t 2 grid oinstall 4096 Jan 26 18:15 ora_oc4j_type_grid
drwxr-xr-t 2 root root 4096 Jan 26 20:09 orarootagent_root
drwxrwxr-t
6 root oinstall 4096 Dec 6 09:24 ohasd
drwxr-xr-t 2 grid oinstall 4096 Jan 26 18:14 oraagent_grid
drwxr-xr-t 2 root root 4096 Dec 6 09:24 oracssdagent_root
drwxr-xr-t 2 root root 4096 Dec 6 09:24 oracssdmonitor_root
drwxr-xr-t 2 root root 4096 Jan 26 18:14 orarootagent_root
-rw-rw-r-- 1
root root 12931 Jan 26 21:30 alertrac1.log
drwxr-x--- 2
grid oinstall 4096 Jan 26 20:44 client
drwxr-x--- 2
root oinstall 4096 Dec 6 09:24 crsd
drwxr-x--- 2
grid oinstall 4096 Dec 6 09:24 cssd
drwxr-x--- 2
root oinstall 4096 Dec 6 09:24 ctssd
drwxr-x--- 2
grid oinstall 4096 Jan 26 18:14 diskmon
drwxr-x--- 2
grid oinstall 4096 Dec 6 09:25 evmd
drwxr-x--- 2
grid oinstall 4096 Jan 26 21:20 gipcd
drwxr-x--- 2
root oinstall 4096 Dec 6 09:20 gnsd
drwxr-x--- 2
grid oinstall 4096 Jan 26 20:58 gpnpd
drwxr-x--- 2
grid oinstall 4096 Jan 26 21:19 mdnsd
drwxr-x--- 2
root oinstall 4096 Jan 26 21:20 ohasd
drwxrwxr-t 5
grid oinstall 4096 Dec 6 09:34 racg
drwxrwxrwt
2 grid oinstall 4096 Dec 6 09:20 racgeut
drwxrwxrwt
2 grid oinstall 4096 Dec 6 09:20 racgevtf
drwxrwxrwt
2 grid oinstall 4096 Dec 6 09:20 racgmain
drwxr-x--- 2
grid oinstall 4096 Jan 26 20:57 srvm
Please note that most log files in a sub-directory inherit the ownership of the parent directory; the above is just a general reference to tell whether there are unexpected recursive ownership and permission changes inside the CRS home. If you have a working node with the same version, the working node should be used as a reference.
In Oracle Restart environment:
Here is what it looks like under $GRID_HOME/log in an Oracle Restart environment:
drwxrwxr-x 5
grid oinstall 4096 Oct 31 2009 log
drwxr-xr-x 2
grid oinstall 4096 Oct 31 2009 crs
drwxr-xr-x 3
grid oinstall 4096 Oct 31 2009 diag
drwxr-xr-t 17
root oinstall 4096 Oct 31 2009 rac1
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 admin
drwxrwxr-t 4
root oinstall 4096 Oct 31 2009 agent
drwxrwxrwt
2 root oinstall 4096 Oct 31 2009 crsd
drwxrwxr-t
8 root oinstall 4096 Jul 14 08:15 ohasd
drwxr-xr-x 2 grid oinstall 4096 Aug 5 13:40 oraagent_grid
drwxr-xr-x 2 grid oinstall 4096 Aug 2 07:11 oracssdagent_grid
drwxr-xr-x 2 grid oinstall 4096 Aug 3 21:13 orarootagent_grid
-rwxr-xr-x 1
grid oinstall 13782 Aug 1 17:23 alertrac1.log
drwxr-x--- 2
grid oinstall 4096 Nov 2 2009 client
drwxr-x--- 2
root oinstall 4096 Oct 31 2009 crsd
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 cssd
drwxr-x--- 2
root oinstall 4096 Oct 31 2009 ctssd
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 diskmon
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 evmd
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 gipcd
drwxr-x--- 2
root oinstall 4096 Oct 31 2009 gnsd
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 gpnpd
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 mdnsd
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 ohasd
drwxrwxr-t 5
grid oinstall 4096 Oct 31 2009 racg
drwxrwxrwt
2 grid oinstall 4096 Oct 31 2009 racgeut
drwxrwxrwt
2 grid oinstall 4096 Oct 31 2009 racgevtf
drwxrwxrwt
2 grid oinstall 4096 Oct 31 2009 racgmain
drwxr-x--- 2
grid oinstall 4096 Oct 31 2009 srvm
For 12.1.0.2 onward,
refer to note 1915729.1 - Oracle Clusterware Diagnostic and Alert Log
Moved to ADR
Network Socket File
Location, Ownership and Permission
Network socket files
can be located in /tmp/.oracle, /var/tmp/.oracle or /usr/tmp/.oracle
When a socket file has unexpected ownership or permissions, usually the daemon log file (e.g. evmd.log) will have the following:
2011-06-18
14:07:28.545: [ COMMCRS][772]clsclisten: Permission denied for
(ADDRESS=(PROTOCOL=ipc)(KEY=racnode1DBG_EVMD))
2011-06-18
14:07:28.545: [ clsdmt][515]Fail to listen to
(ADDRESS=(PROTOCOL=ipc)(KEY=lexxxDBG_EVMD))
2011-06-18
14:07:28.545: [ clsdmt][515]Terminating process
2011-06-18
14:07:28.559: [ default][515] EVMD exiting on stop request from
clsdms_thdmai
And the following
error may be reported:
CRS-5017: The
resource action "ora.evmd start" encountered the following
error:
CRS-2674: Start of
'ora.evmd' on 'racnode1' failed
..
The solution is to
stop GI as root (crsctl stop crs -f), clean up socket files and
restart GI.
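A minimal sketch of that cleanup (run as root on the affected node; the socket directory is /var/tmp/.oracle here but may be /tmp/.oracle or /usr/tmp/.oracle depending on platform):
# $GRID_HOME/bin/crsctl stop crs -f
# rm -f /var/tmp/.oracle/*
# $GRID_HOME/bin/crsctl start crs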
Assuming a Grid Infrastructure environment with node name rac1, CRS owner grid, and cluster name eotcs:
In Grid
Infrastructure cluster environment:
Below is an example
output from cluster environment:
drwxrwxrwt 2 root
oinstall 4096 Feb 2 21:25 .oracle
./.oracle:
drwxrwxrwt 2 root
oinstall 4096 Feb 2 21:25 .
srwxrwx--- 1 grid
oinstall 0 Feb 2 18:00 master_diskmon
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 mdnsd
-rw-r--r-- 1 grid
oinstall 5 Feb 2 18:00 mdnsd.pid
prw-r--r-- 1 root
root 0 Feb 2 13:33 npohasd
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 ora_gipc_GPNPD_rac1
-rw-r--r-- 1 grid
oinstall 0 Feb 2 13:34 ora_gipc_GPNPD_rac1_lock
srwxrwxrwx 1 grid
oinstall 0 Feb 2 13:39 s#11724.1
srwxrwxrwx 1 grid
oinstall 0 Feb 2 13:39 s#11724.2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 13:39 s#11735.1
srwxrwxrwx 1 grid
oinstall 0 Feb 2 13:39 s#11735.2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 13:45 s#12339.1
srwxrwxrwx 1 grid
oinstall 0 Feb 2 13:45 s#12339.2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 s#6275.1
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 s#6275.2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 s#6276.1
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 s#6276.2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 s#6278.1
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 s#6278.2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 sAevm
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 sCevm
srwxrwxrwx 1 root
root 0 Feb 2 18:01 sCRSD_IPC_SOCKET_11
srwxrwxrwx 1 root
root 0 Feb 2 18:01 sCRSD_UI_SOCKET
srwxrwxrwx 1 root
root 0 Feb 2 21:25 srac1DBG_CRSD
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 srac1DBG_CSSD
srwxrwxrwx 1 root
root 0 Feb 2 18:00 srac1DBG_CTSSD
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 srac1DBG_EVMD
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 srac1DBG_GIPCD
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 srac1DBG_GPNPD
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 srac1DBG_MDNSD
srwxrwxrwx 1 root
root 0 Feb 2 18:00 srac1DBG_OHASD
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 sLISTENER
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 sLISTENER_SCAN2
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:01 sLISTENER_SCAN3
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1_
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1_eotcs
-rw-r--r-- 1 grid
oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1_eotcs_lock
-rw-r--r-- 1 grid
oinstall 0 Feb 2 18:00 sOCSSD_LL_rac1__lock
srwxrwxrwx 1 root
root 0 Feb 2 18:00 sOHASD_IPC_SOCKET_11
srwxrwxrwx 1 root
root 0 Feb 2 18:00 sOHASD_UI_SOCKET
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 sOracle_CSS_LclLstnr_eotcs_1
-rw-r--r-- 1 grid
oinstall 0 Feb 2 18:00 sOracle_CSS_LclLstnr_eotcs_1_lock
srwxrwxrwx 1 root
root 0 Feb 2 18:01 sora_crsqs
srwxrwxrwx 1 root
root 0 Feb 2 18:00 sprocr_local_conn_0_PROC
srwxrwxrwx 1 root
root 0 Feb 2 18:00 sprocr_local_conn_0_PROL
srwxrwxrwx 1 grid
oinstall 0 Feb 2 18:00 sSYSTEM.evm.acceptor.auth
In Oracle Restart
environment:
And below is an
example output from Oracle Restart environment:
drwxrwxrwt 2 root
oinstall 4096 Feb 2 21:25 .oracle
./.oracle:
srwxrwx--- 1 grid
oinstall 0 Aug 1 17:23 master_diskmon
prw-r--r-- 1 grid
oinstall 0 Oct 31 2009 npohasd
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 s#14478.1
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 s#14478.2
srwxrwxrwx 1 grid
oinstall 0 Jul 14 08:02 s#2266.1
srwxrwxrwx 1 grid
oinstall 0 Jul 14 08:02 s#2266.2
srwxrwxrwx 1 grid
oinstall 0 Jul 7 10:59 s#2269.1
srwxrwxrwx 1 grid
oinstall 0 Jul 7 10:59 s#2269.2
srwxrwxrwx 1 grid
oinstall 0 Jul 31 22:10 s#2313.1
srwxrwxrwx 1 grid
oinstall 0 Jul 31 22:10 s#2313.2
srwxrwxrwx 1 grid
oinstall 0 Jun 29 21:58 s#2851.1
srwxrwxrwx 1 grid
oinstall 0 Jun 29 21:58 s#2851.2
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sCRSD_UI_SOCKET
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 srac1DBG_CSSD
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 srac1DBG_OHASD
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sEXTPROC1521
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1_
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1_localhost
-rw-r--r-- 1 grid
oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1_localhost_lock
-rw-r--r-- 1 grid
oinstall 0 Aug 1 17:23 sOCSSD_LL_rac1__lock
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sOHASD_IPC_SOCKET_11
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sOHASD_UI_SOCKET
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sgrid_CSS_LclLstnr_localhost_1
-rw-r--r-- 1 grid
oinstall 0 Aug 1 17:23 sgrid_CSS_LclLstnr_localhost_1_lock
srwxrwxrwx 1 grid
oinstall 0 Aug 1 17:23 sprocr_local_conn_0_PROL
Diagnostic file collection
If the issue cannot be identified with this note, as root, please run $GRID_HOME/bin/diagcollection.sh on all nodes, and upload all of the .gz files it generates in the current directory.
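For example, as root on each node (output file names vary by version):
# cd /tmp
# $GRID_HOME/bin/diagcollection.sh
# ls -l *.gz          # archives generated in the current directory, to be uploaded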