From 85649af65c0f1e3068d68633b3e983994710eea2 Mon Sep 17 00:00:00 2001 From: btkcodedev Date: Mon, 10 Nov 2025 18:31:33 +0530 Subject: [PATCH 1/5] Add Hbase plugin --- .../src/app/ingest/source/conf/hbase/hbase.ts | 42 ++ .../src/app/ingest/source/conf/sources.tsx | 2 + .../app/ingestV2/source/conf/hbase/hbase.ts | 42 ++ .../src/app/ingestV2/source/conf/sources.tsx | 2 + datahub-web-react/src/images/hbaselogo.png | Bin 0 -> 21196 bytes metadata-ingestion/setup.py | 1 + .../src/datahub/ingestion/source/sql/hbase.py | 540 ++++++++++++++++++ 7 files changed, 629 insertions(+) create mode 100644 datahub-web-react/src/app/ingest/source/conf/hbase/hbase.ts create mode 100644 datahub-web-react/src/app/ingestV2/source/conf/hbase/hbase.ts create mode 100644 datahub-web-react/src/images/hbaselogo.png create mode 100644 metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py diff --git a/datahub-web-react/src/app/ingest/source/conf/hbase/hbase.ts b/datahub-web-react/src/app/ingest/source/conf/hbase/hbase.ts new file mode 100644 index 00000000000000..b563b31e8ea1ea --- /dev/null +++ b/datahub-web-react/src/app/ingest/source/conf/hbase/hbase.ts @@ -0,0 +1,42 @@ +import { SourceConfig } from '@app/ingest/source/conf/types'; + +import hbaseLogo from '@images/hbaselogo.png'; + +const placeholderRecipe = `\ +source: + type: hbase + config: + # Coordinates + host: # Your HBase Thrift server host, e.g. localhost + port: 9090 # Your HBase Thrift server port (default: 9090) + + # Optional: Filter patterns + namespace_pattern: + allow: + - ".*" # Allow all namespaces + table_pattern: + allow: + - ".*" # Allow all tables + + # Optional: Authentication + # auth_mechanism: # Authentication mechanism (e.g., KERBEROS) + + # Optional: Schema extraction + include_column_families: true # Include column families in schema metadata + max_column_qualifiers: 100 # Maximum column qualifiers to sample + + stateful_ingestion: + enabled: true +`; + +export const HBASE = 'hbase'; + +const hbaseConfig: SourceConfig = { + type: HBASE, + placeholderRecipe, + displayName: 'HBase', + docsUrl: 'https://docs.datahub.com/docs/generated/ingestion/sources/hbase/', + logoUrl: hbaseLogo, +}; + +export default hbaseConfig; diff --git a/datahub-web-react/src/app/ingest/source/conf/sources.tsx b/datahub-web-react/src/app/ingest/source/conf/sources.tsx index 2b362255f8abc0..40c25f6997411f 100644 --- a/datahub-web-react/src/app/ingest/source/conf/sources.tsx +++ b/datahub-web-react/src/app/ingest/source/conf/sources.tsx @@ -6,6 +6,7 @@ import azureAdConfig from '@app/ingest/source/conf/azure/azure'; import bigqueryConfig from '@app/ingest/source/conf/bigquery/bigquery'; import csvConfig from '@app/ingest/source/conf/csv/csv'; import glueConfig from '@app/ingest/source/conf/glue/glue'; +import hbaseConfig from '@app/ingest/source/conf/hbase/hbase'; import hiveConfig from '@app/ingest/source/conf/hive/hive'; import kafkaConfig from '@app/ingest/source/conf/kafka/kafka'; import lookerConfig from '@app/ingest/source/conf/looker/looker'; @@ -49,6 +50,7 @@ export const SOURCE_TEMPLATE_CONFIGS: Array = [ oktaConfig, glueConfig, oracleConfig, + hbaseConfig, hiveConfig, csvConfig, sacConfig, diff --git a/datahub-web-react/src/app/ingestV2/source/conf/hbase/hbase.ts b/datahub-web-react/src/app/ingestV2/source/conf/hbase/hbase.ts new file mode 100644 index 00000000000000..a15431904512cc --- /dev/null +++ b/datahub-web-react/src/app/ingestV2/source/conf/hbase/hbase.ts @@ -0,0 +1,42 @@ +import { SourceConfig } from '@app/ingestV2/source/conf/types'; + 
+import hbaseLogo from '@images/hbaselogo.png'; + +const placeholderRecipe = `\ +source: + type: hbase + config: + # Coordinates + host: # Your HBase Thrift server host, e.g. localhost + port: 9090 # Your HBase Thrift server port (default: 9090) + + # Optional: Filter patterns + namespace_pattern: + allow: + - ".*" # Allow all namespaces + table_pattern: + allow: + - ".*" # Allow all tables + + # Optional: Authentication + # auth_mechanism: # Authentication mechanism (e.g., KERBEROS) + + # Optional: Schema extraction + include_column_families: true # Include column families in schema metadata + max_column_qualifiers: 100 # Maximum column qualifiers to sample + + stateful_ingestion: + enabled: true +`; + +export const HBASE = 'hbase'; + +const hbaseConfig: SourceConfig = { + type: HBASE, + placeholderRecipe, + displayName: 'HBase', + docsUrl: 'https://docs.datahub.com/docs/generated/ingestion/sources/hbase/', + logoUrl: hbaseLogo, +}; + +export default hbaseConfig; diff --git a/datahub-web-react/src/app/ingestV2/source/conf/sources.tsx b/datahub-web-react/src/app/ingestV2/source/conf/sources.tsx index 353c483ba103de..1c2a4e4e9b50d4 100644 --- a/datahub-web-react/src/app/ingestV2/source/conf/sources.tsx +++ b/datahub-web-react/src/app/ingestV2/source/conf/sources.tsx @@ -6,6 +6,7 @@ import azureAdConfig from '@app/ingestV2/source/conf/azure/azure'; import bigqueryConfig from '@app/ingestV2/source/conf/bigquery/bigquery'; import csvConfig from '@app/ingestV2/source/conf/csv/csv'; import glueConfig from '@app/ingestV2/source/conf/glue/glue'; +import hbaseConfig from '@app/ingestV2/source/conf/hbase/hbase'; import hiveConfig from '@app/ingestV2/source/conf/hive/hive'; import kafkaConfig from '@app/ingestV2/source/conf/kafka/kafka'; import lookerConfig from '@app/ingestV2/source/conf/looker/looker'; @@ -48,6 +49,7 @@ export const SOURCE_TEMPLATE_CONFIGS: Array = [ oktaConfig, glueConfig, oracleConfig, + hbaseConfig, hiveConfig, csvConfig, sacConfig, diff --git a/datahub-web-react/src/images/hbaselogo.png b/datahub-web-react/src/images/hbaselogo.png new file mode 100644 index 0000000000000000000000000000000000000000..e91eb8d7401ad07cf9a7ac7e062200e6eb488929 GIT binary patch literal 21196 zcmZ^~19)ED)-D{|wv)!TZQFK(C$^n5NgLa?ZL4t_qhVuP|I>HxbH07{_5GPwlC|a< z_Zo9d4y`C#5JVYiaa9ly&|ko7PZ%iRUnQRf0Pr8Ei>j0;NX<0CG4KW! 
zAg$v9Bwqge3o1&T=?MY?L1Ck=?W(OH&uivr&uC)qXllXeY5x}r0>baf3%sTAUiHzhQ64$Q+WZDYKBw~)v79<>u?2OE0f^Z}xB>c|imb|Ls zlK+MS?*z!KU0ngZOiUgg9*iDrjE>G$Oe{P+JWR~2OsuR7KneyIF9%l>PX-4U@_)VL z|GY=s!o|$l2Hgd~{97!bfJ}d%FtIQ) zGyQKc3s0N>AK2d~|GzMEv;Qj@6HzU4S^2|CJrEZ~rgw|7oPB z3E*GM{^`Vj$;SV;<-DpEE{=9?f4e}#!NygPmH!`s{=52rdgNayF-LoVvxSQb5GKg+ zZ^(bv{Wrb#|F99{Vg5JZKP&$mpyX@=l+@&Jy98POP5+;D|4py+zcKjF%KrfT?Rs8i z8&3;6ZE+i5hyEi4I~TB7|Lc?gMv6JwIXbHWOw26)w(K9se^mW9^*?-c{)Z0>>;J>& zKc4&t(%g*K#m&^#!p!wQ#tS%V|IB_-6Dy|wsOD$-U#apwB8a-(y*K?_zeG4I8A$-sia$jGFb=Fz?JY&<@< z3EI;pn?2qW&IS&HSL=+y(-qcce+6 z6DFv1)DX3)AWJ3-l~$_AC2N>$9`oOVQh;N8!v>(3;3Vk~SmU@WX;97nw@svkeYJ|T+|C^WSZwFOF~6O zWfdxzH(wba?s;O-|GBPp~9*gPFU=g~5JR~wR$VEYv%n+XQeI87* zgo!Y)kUZfSgP|kkOJ0$}8o=}^O@SlxGgGm!TvIErp=fxG5S?CQ8+w{3|7j`M6^80T z6kp9z7WoI6w`(FCS%DH44ee!20>x#rP@>RD!N6cI#IZ(Manofo7#>io(^z6*oTu9S zFR7^!LMfm^tx+r-<)VTJ?`Fpnzht`qnKhxQB}CKkTCOqZ)Y+-e2r*qWLuSrv3KkH~ zm?-!=n63zsZ3^;4-$WuPAV33^sr3Esm&`uiZf7p0`@jP_yzfu!$z$iC+{3B4w#%1t zGGWxlF`+P^M8UvKwLEz#NMJ=s1V7(y-Q`jlyDQ4dq|X=$!~gvGGeD(~aYhnEH42Rc zugZi0MEotZj8Hox;s{kuL4>ZPZ8CVgu7B1K6l|p($k7xP6$KmYRvY}0qE+Mna`;;+ zz@WsZWH7kGX;=FD3M67W^1tt^n?%`Jl4a)P^c*jitNlF3C?xU<0&N~WfxJ=I{5zfV zjD-nAvtVR@k{Z&$$6fs%jCkDW4G74lSv(g9#8eP!(H0K`4hVl| z0r`#c1!}iYgJyl@GG@Q|>oO9cxz%=Dx6}O`e51147GBiXRU>psDpGy;Z-+xbfFCAG z&RSxsl)xP@U!671v>S5L2#p24mHzwY6?+uB%&EOL3~g*{40;Jsj$41 z`7|wpc`;@&qhf{3R)^bHrNX(ganvpp&8FaDujtoDr!RB^CuX4)yMMdwEgV!uH-UT? zgS0MW7r3AF36{xqU+5xUMt<}_cp~Ow7Um$C#wHW0}zBHRPf5dw(}kQ zH*pthFX9cHiTq1lF;bDH$|X?=x)JsgwNMT60(I7Zy4Y0sK~n)xX)84A&!xWI=49s4 z6vPn~R15>aMOv-&m`n?XFt@)3Rym> z|A$&^sNC-F0D=w%f;wo}fb$7+y;joT;^K;uY$5??i0wDEDA!iD3IyPA%d`lZ4n>OV z8@UOmY)9ful6c4oWbzmM^gxJhj=eu!-ez~&9m*q8^gXR^{RKrQ7J1X(v4rKb!oR@Z zjZ;l*zHNFBP3#L8Gwd)H#OJV8N3{FJ3nm6`n#6a`Xk)}uP$=I7^XdDz>2YAZ{~f+i zD!xVnf4#xER_@hw!=|v9WnQ!0s%ko0(CWZZMW>D-8t|l_lvdYDakDhf~hrJ^lnwc{71gQRn-^x>~sh49mdg2Z{>it>ROW zOk#m!`$38wKn06}hu6U|o7b%e`N3Bet4w%11pGZwNpKa6EfYiOM9^th{CwFczz^i$ zP4ScZfLZCg^_O_MfeI_(?Y=;Nm?1qXbG~8j0P%?9Cuw=lmoY)ls~uLV?-5UZT--AU zy{&!oNu1rV`hs)ebZYV{>gtOo0M0P0L>seNtyPb0KM#x{fBu!OkTf`b)0Q30x~BNb zu?*)wy*xWtKWr9C&YdesgeZyFEalZAkHP!cL$zaj+&W{iX6fz2+6<-}1JN{3(06YG z`T|-a@7pFk5+kaA>u%XlRry*iJsW4nH5inNK;kX$PPXQE?fNgs@%t%Dtscys8nd^w)ezxPym!a@)WrBFJXa1H#5QOeRwpjz~UbXd9 ze4isV3A-|w-oMF1FrAFJU=S^$#|25!i(oAkZ43GJ<0FHc!QQ-pI7<6Q(b4Hp;iIUi zh%ar$0qtUR*-!d)x@3wgq}?;HEJxy1u!L#BLLc^mQuE4q3502+T!Uw)=+)Clw1b`3)56;k%^)zPW{@WBe}x}kq`3ogaADgPNvC zLoDQ6GBm+I3hg=j$=wVmy?Qsg#tA--FLidWq2EOoUr1kizM1xZZve+#(PAUu2JEo3 z9YI-Wo%Wey`FX>`C&qe&^y`7pI{7nfXMEGaDmiQq7HsJc|7P|gP=-c|p9l5^LJ%f& zJH%VhQ? z`#kbMAkhN-`^uImj9deATujQ<8*?wAbH}e?z4sI0t&|j*93L;9K-e3Nf+J^yN?QI` z(mD_=sPoq!Xd75eKeROW3l^8+(@2CMJ$&Ep+aGof`e2dWS|Y~SX_q6s-VEjWVY4o3 zz8B5-BvfCdA7(@~5oIAFNxseoV?Y)MiJmctWQec3qb!4)5%q4`7Zy{KPW#9Pl;P)^ zny$%1`c9xX0?(n@Y$<7JRv_ zN35x{iPi4*cH0*Ecq%*^B@XbdaMUD;X_@ZE#XJ|WrmTayCWqZ17cO+gsG@__7SL76&S6a9zwY*pnq z@nrXQ$~lucl_BvzPC}_(<; zq`w`4xLr%>M_ksd-yBT3MMf!gy_{5bPV=5tb>E*X>4m5N?f?acva=%k2?6p0r~Sk% zcK}y5Y?ZRg6_5MbW1beYZJr!9Ak*+3F|qS!55dewvWB-t$9lRw?<_I6!ASJ__h+h* z*r%9>c8Ne=@bGOp)F0Nje3xBp#QN*sYHyko!|yBWiI%1LwgJ!kBF~6+EC7+}A6Mq& z8uhqB{P1E82_UVmXkmDnUJ+s8IbkpLty|uw%F1{sc*(yqxg59Lo?Z~2j@j>W71^7k zWL9QcB34?gz8UiSJe;8e>iK}%&OJQAd4og@mi_5^>m}%{TUI?zl@~L0w?%3lZ=kp! 
zNM?MWh;HU+m8W@dofnZPvUx!qP<^Npn}+|gOoi~vUeMJ>Oj9#ur=)d5FIe6VVW2x@ zyT1o4rfF_ZFcj4^+gNNSc(ZE$#U^*uo{oVc)WIB#d~ApZ{W@#Y=?Fp}9OBpb=}s4I zP&uUis|Ja&{3-~U83JmlS7#Xnw5AP77g@GvM1;svM=Ka@pirO8%Xd;SBAs?;3oV}~ zSQ6@D4HiMf2xMZ#lqcZQax-kPTaEp@`}`Fnt@Ymg5Xc@g80EIvJ&}43{+)akXMgiu3BQ_cijRxIk^t;{^Ud@KazY` zOi`Als60c^_P|SR%Nx>Ra}jbxz0z8iySFG|F+)SCO-mS}rCx>Z=8gf>QZIO=GfCeC z4u&XcXkye_m*YiQc~Pa^6JT+1{-}v3r#kMo!}Zvi3r;em|A!o`q|G77mNanz4CO}* zCFjd|>o!yXaJS@BVb`1M@q6`3Pfwrx`m4$-$;7}$K){BqDwz7aa&EG@pW{n}Mu~;v zQl{x!LMrD7iJ|ST!f(oTQskY|B4@Hts57nsgv~f_f z;04oHF4H`vYoV~{O)Je>y4Sca(qz#3cj1Znk4L!y?oz5anygr|P8vE6)|D53<_gEt zE)iDiOvElJT3a)&P)14xV{@y4+^%Z-b^xC=w~~Z zR_CutD(#rQMO)er($%!>GPW(MOi>p1EQ;Ar-xc}Q+>nsfjP#t&0_}%K@jHP`ZnL{T zbF55y41M~^=x~=DsG4|Gbr-?#nFlnpPsEDDxI>9KhmRhDYiThix0a6lOwag|juWYZ z6b?gzS9(g5{V5ILKK$`=I{hEBLB9d3?l4v(jWt}Gu4*1wP672}G~sm4KB(3-=-3Wk zTioLHNqaGL_AZr#C>pKvuV9i~li%OxZ0SJ|QUpci)JKEX$Ir+uopP!s!-&_OUltb^ z2Y`lcyQ($MwZL$H{)70JJ(VQi4y`~z*T%_a%G7qqQf!WZx9iclwDUV7;9@V8{#s&k z*cRi#8DnE#3?-Nh+=0MSv?jm+L8c|34x$g4*nJShnw5Y0Jfc2rAwCW7Nzr|&0Tq@I zpp%^3?eF4^ZE7$AW|dgt8q-G`!$a!ymiRg0zyoz0w?)V+!|0$f9#jVeH3RL4Y?rhA z>}8-(LShPDy%x?4(e?JXZhZ!waqO7J*Db&%`ik87xallqX-)CQL{j*0D$&3iD#Nap z|L6N-4m2`R);Fs!{q<(&Ehxf3JDPMB77k*Dxo`gkEm1{@JBv~OXV@48!yGhek&r%d z6H}RlHkKF`NvLx4x$W~&$|F*Z2qq%y^psCQWKFHPk}Jg37yg?u@;xuctmK7jv5XG8 z>Thqr?dvj>dO7g{*bXVPWzY`V3o{BlSWjm>pv40;Db8)Knjn0`&Nu@LG0~WF>$4m6 zkI|%THjf!Xiuv1L%H%yxxJnQ9Ff$<gC4-IkF#^ht9?UX6s zNSO4l7E8qwIhw^i7NKa^q(W@*OqApRm7N5ni{^FUX!G6!x6E!#-(*^kFudS1=PFlM zP+==Q-i*q-E~@MBLa2!z*vRn$`rC18%HxKnvr=x_g@q&eK8Zf31ZxK&E!@TA6AHvN za>aF*$iv#?n4G9dI&9Nmt{3GiP45mwzwfcCy0exLNdoQzEc#G(g}ANcQM-YHNNru& zt|7yS79o4{{^)P>RIT!I%FJvcRr`8$@8L93{(J`o(tEectd@!@t-dat2e(t9INC{R z^m~g`vE9(-iNj9HJvt|`eh;fwwW6p1GWRsZ$pPqmI|=#y)$xXzgB)Lu9M9b-I>05D z15h#j$#CiA(5|6{h|_ngPKZT0lLtMAw;?e2Sg$A65=iAFuH9>aW;jYa&$@D;KSRDH zJn|uCsj529jDkvf!0I&?@nc3%riAOjJ0UgR51iOJpvF5WtYx0-g&xX}G#Vub12@`B zuw=%+MSfyuP1vWKwc;o*NPz&8BeDm+aVPexqOY&7vIOfwsS>u^+j5lShVSjq<39E5 zhYS#-M_06BB_TJOrg53e1ADKAuPW0TM!_<4KNp&slYMhMwNLrPe=d=J6 zO8XzqV@6~5fqVx~WDF4iM~f+h5{J%rP~Y&E`_s|J2r77(pP@!0Y**QR>{2d7Mokgk zg*-LqwsCSk;jmkW($Yz_C)3${t)On_&5K{^3Xaz>MxW7V-MdpRrLp{JJ%d0;Gzic_ zGMUPBhSA%bLv01wgxZHA1oT&x*5@|22{e->rN9#9 z$usD+yrKWNV7=?{zL{CI{TNngb=XXCSZ_JE zn}_6BDL!*iL+@~soK%j}bgzgVtBa!MOvgf=n##ss)J!V(HMHHJ-hj^z=@`Ks9AUkj z*lJi03hupL_s8|3W0*Ojq@s$Dz#T16(v4*z%acXL!rACDeJUx!Htg>3h&TRvX92S;1VxbA;Wzj4SWf??G{nG`?VFZby#F zN2G)Epy=3SGr?u$WZO(N{IZLW%f)&-xo`9v%r-e~61RnzFYA6w!wPBUN)Ml;YTWPT z&g-=_*B>p6x${1+&tn(3zvfO4^?c!^^(dM+U zqaFL#l;fr+IhlmGr0x!}T{I&$76!!49LHQK^d|bS48umli+6@%I@GoWt}bAjBu6Fh z(5|MdgTBC%!H1T0P*j1%8@6=^h%s)1(Mucll!KxE3g}O@G3=mV(YNYoTPf=0md~xT z6v?>G2aRySN|ibMb)JyarV9zR$p)2e!;S9bnxCGQZGsvtX0FLd>c}EZ+Yu6Yu`;>A z$HZ`PEfV1(=vG;8fwl+Wrln_aLQoBi;ND=@h-|fRQ2?M!fbvU^an0LUIaGh%_~Cnn z%!(YX(bT&A`u6FaJIy?0vM`|)ca3_jBdZ!-3Y2@79S zLj9bSPI!hn8GaPwsO5TZ^s?l$L>OHWy6j}$R@U0=*wIZbh((!oH4Phcpg;UrK6^m^ z#qp?kXnR~!b%YZtRI2v~cvZBI z+hS;`jh{7|)!v@6Z_qz@_X23>FqQBRm(0qV`I27p&;}C*Xy^p!8QCUV8f6lxP6c1j z8dFyT&e<&{l31_`bU9-?kddJ3Tw;f(j(a?|ypDV$zxEyL_t7CBWt=Zhh}&>z4Ww(o zx8zou@+XHU0W_~sy*Dde)b>U?z@M}BpBh(3U#RqJFO5mdF7}ZvbxG@*gy`puy_sn^ zez!+iaeFC{_-Qp-imE4Fzf*@?;PK2g4cNh2+h)t|GeVJMS?w7_F@T)TR%lcbI~j35 zr(5M554%xp)sR)Il^OL91xD`j5SF6mtmf?Kua5+U7t5k+;+sP+1$;+j7xYpl>tV06 z_ViZKJsZ}AGf?Nsm-Gxh7K3qS(;{7c9?)pMeuvsg1s1^cdK!NU^g zJE!aL!Li?)A{>QsbR87hx=4jwBmyC!89Qzc`cEq@V+_m_qNv4VK27xkTbo>8-Ff#v z2WIf9yKCu=*>5^oXhPrK$MtBvJG0qUx}A~1>)Ukg)OOM4v(Tj=H~&=M+@BP-S3N1j@?fD&3Up;u>=kdoX!vA z!VmV&jbsl)j(OcgkpObe^h-nMzJ42t@H!us)@XDUYYW(hvYpKCx;*zcua1yaq4lp? 
zc81p))}gtCRY2!de^Cx3AvkPSeFl{>An-X}tyJ;-tYzJy^PzRy|CG>mjFuT>Zf>q4 zG&FQ?e0Tm$O*4ZzCB}OWKdH>um1Wsqnt3ekYKKa)b){jS9%m2FXC{g8fST z@oM|dOwQQ(TweAHv}#Iv1ud;hW@BT~_~b-2?i2e%hSm*MeN>GZ6xt2yFN_xMXDd$j zz^GD+`^lmW2P2F}i}jr7+1W6`NWmBCf_v+!OtOIY`(=zbHm+dmoIw_!^LBTB&nshw z$l7&R_#btkT#4k&vASSR+2H*C?;D#?vusV_mW*;r-f#xgV{;OoWQ@F_I30L2H-dOz zgKM>jt(*{4`dW(&8%4D?78|D^JgQ8JWUd2{&~z{eXia@|WG2Sv&2=3$x}ST$tSlsR zcvl9>7Ce6O5}`0;t`iQGWUin8iJy{h8vf!F5WJ_1b9Asy8!OhalgOFE)H!t-+&ClX zBcOk>9T^bu2DEs)!-T0(0k{Y@Z3=R7e1QYYWZWZ45s3ILIrO7_g>3ld@KWvZO{aa2 zb~Q^^mPBGjj+7s9@vxY)yh);DKk{^-%3-vZl>IMZ44lWl~gl z)F}$t$kCN^qSCVN;gd7X`zt#4RYhscj`=!LATt5U73Uw`x>Rd3L6Y{BgFjT_9pd@Z zCL&gNl{1I?C&<^nP3Qy|(4f%l8Z+b69sN@^N9|Tn1& zFVVs2p!nwjoG;6#^Q3m_rA974Iracn+{!2=krk6BY8fD0@vFd?%NwZT=Xt(ZL`_86 znkOio!r6XFC=5L56vXFO9iA^CO+DsFhQ46`8@kRQj?@C;F@S$e<>_NB4uDB1osYI+e9k5YUS)r-}7y&^5P8!D!ilGvsIiiNw0}W z2II)wtFX{AzJd`M9-T3xD_+s&w_52MO_}R`+K?SLPbH z+CfE)!9Nww)D7##Za(E@MVbmwNX zN3VX9*nduN)Mu9xDDq!_^qxBTYr>7p&F?FpIfTeP2CE^n^zgu?7GFZ7EAYfrJ)%;?dZm9GijRU1Rvu?|+6neWFthdh1 zpj73DcPa}(CW4b4m5jx^Uxa7sx&jToD5@m^dt9pN{dj@?Rb#-H_F6DBG&BWtFcMgA zbv@U54!>YB>WEK15I{H7k(r=oWDI5OKa;Ckt zOjIWjqKL*?CLo%L5`n1r1;2OTkwWN7ho}OL9YSP*yV;@6i;h#hNc9#bpxSIs|5_?A z2j*bZqJL&sd&HjaXWDUlnCbYzY&n&N*)@~*5puU@-gZQ`6>p`q#OO54$r;S1S4%px zJhUmE3snnChv-zod9G&6fmJg6GN7;SMTP-6Y_eKt;}w0Aa6C`m!*i-`)|o{W64DhX zunXf9zC%4`O0sP}#fFjT?~znOD=5|Y@*O7B0BRn<{W|-(B$MVp9#lonxS$mWkM!fJl+{e12H@_4yzlBcGWTdNX$A5iWwd>nvelbbf>UaRaIq8;31BJ|1(#p7d7h?^X!-rA9!eC zC-J3r;p!XxHii(0DS9O-a7y}$PV&qCqWALyUWRXpaK#?3;kJ)%z}pgy%dm@tLol?c z;3Y0CXvL=Dp8Qxt(Xl?^4NGDe2SI}IK43~f39}qt4c$;X@9{JJ`>*fo&lap4A>0T~ z9N-zv{BGfhqqY{+%a8|8MtpS<1LWNoOjht{@R6o;Q-Q~L8daICB$WPyF17sgq6rKV zo^HitNvC&b@tJXjHJz2*6Lo7Cna!^Q=e(spvN@XGBb>V1CjPc>nfYX&DU5wb_xmF0 zr7b>dj6Tn4`loJ}TYhMX_Ih43jy~gyjjTAZ7QLbZOj*Xuw1_Z05?@Pf`5`1ZTZbpW;=~|P`E6_vQ5Wycoy4H*Sxu2A;u84mU zl=47%x?C$1@3+irIvVX~%g;y!y$9&UKIFRf{N#PsbZYoH*~U^8XES|Howtf$auRDU zp|7gB$q`yU;LqW_PHVuwQ8)T2V<_!KsGK zolJ^8BU(%)Yy1;vYJ6KRdLTIXMG&lw9;|DS6lo5s_G=v6;&@NubK znRV=FmEi*~@{D4(q4#ld&i?0wA>Y}-+|3l-04?U2c;WDa3Oq9=P3$)c27Se{CRpe< z)6!>`9+qesnn}7k>&|i#A$r#qkR1Y!YFdJ(7j4<;!=i6b$td69o5mW%SxGkZ>%bRu z*zcN&caQx|hD%3)wzp?qfaNb|qXEjHrDXdirc)PlM5#BvyG+NP z_SmQ`&HA4fuo`qhX$yU;Goi)`6$YITDK|qfz9S7n+0kff9!;*VcIz$GxA+3kjbN%cp`4nLjL;;D!|Ww2I2SWHwv2IipZy^VkE#03 z;~tdyXSDhXEk`iDon@z{^(x6*dDC}VzytXh5udY3H1zEZnBI6rpVO!dCRKsrKst~T zD71S5OeN_|*p7+fR=-xb%BXvRLePj7Vj}#x!>lwUdMTz);$5Lj8`?)T( z-oDip2nYHoikYWHm>Ow=rx`CXlj6~pL;3!_1TcLKTN~jwYv_pySwn4`w#Q&zAy0rO zye+JW0ut9BB9n71hcs`!4y)&3gZx$k^CHHhQ!ukP0l))c6QDXrnQ8>3t zP>8a{+Su~K^>ay+N7hnR0Uwy*^4>O-D6@1aLgfS_tr7x-*wW;%=?xhzG^y8Y+Z^Tz z$CdyF6a6upFIJC|1ht}_GwfCS{;OJV8K08imqHJ`1#DNN>ST%7tp^jt*#R&Qb!MkM zy|GYW4y6a`al>T>#JC%0cR2@Aq@pP?EEwoi(0#Q&<%~RX1O~T^Hh~Fk_Nj*6w4`NQ5|sJdC@qa0iX zyA+;tN%+d;%KDBx?$oIBq@pqf{L*Hga_78^GN+vJ4S2{#sEp_U*3>a;Ldbvav*&H| zv|FNvUK{}vUoZ1#6B90nv5lH&wwJ(f#-}H*6WVe+0t$L;P$ptjR6D>1ydXnch#?+A zK0h9NFL&|+=wEvr3&uh}T@QuUAU--8wlg4tehy|$$YRjjh&$N=lRCh6+)Hm~#d?V9 zuTsk!cjR{ZiZ{GH5Qu)W9 z1B+>yFIM|eB292e%m7yM*W;q5rwo+kPVug@d+ftJq5wfG8_^`gZ9$=iE6o$H{#-YW zp@wvRbZ+|t$;c_4kMP{?Vc$9BFp~FFSiPY z>tXEP_`T3o>+zEAhqbR_%KPoY+-i}s^aqnjs7RA{&w!#Ynd~+V*uiaB4BNQ$>;oj~ z)htHGfmn`eFq-(0^OXd-tr-II^xSWP)_1?q3+M;{5lDn7cbD4%oQxy^`SPSC&smmg zP-LGQs`1oH(WS-XD2uB*r~X*bTBd4zkV*<{q!$5wq|4#>9H)Wb730e?W(I6ynlZTT z3<`hyzg`<-&3&CwiLOg2Xt~*nqz{X$v0RK$$j(h^bKLo@)~mVo%X_(@awX%r!SdnB z7WeuKOL;oTBeo9e@YZnM#-HR(-cH@-a_n4wngVvrzCigpU;yb7>NgJUMv4&lgiL$Z z)eTikPGUOH1M(|Oy9IQD-9xxfwfXRetMw`{K%6~nvURC!#IkWsn8|71fe~z5OOVg( zoYywQE+5MqHI^nYhj^6ZbMYcY?009zy(%u=^YgdjdjC%(Fe5uushq$x#*2;iTT`qr 
zF{_1GKX(#=X}P(4c`x(~Zj+?P0kgGxPA|RkT?`88tOryS5Cs?-^6OPebTU@04lK_m zj-B{5==k4-xgBbQ$*K(VYj@4DcB7%h-3rT}mJT?+K3x7Tb_WdGw7-7OslWZ{ZzrNV z;5apu_$0lSEFi!2QF`nz@G<2iNnvMe7fpL1CLk+&&D5YaOWh5w<&~kx{=${{z(?iN zSp7x($R)DEde)YzLjJ8D=mUF&c|feze`>EFR9Fu{c_xE=zf=Omzq2T`bC8o4rjv`R z_VCL@P&vTiHTXqh8;>|+gcaI%d*ACmj^I1`egj*w!H67Ey2Kjl%pA{^=nVae5CuXX zVKexe<=$xW_H?Z@Xz_?uU&{wt?*o424tnsW;Kgd;P0&SE^?>^|{`WPn8OfJ!Wt+}E zQp4nF&k3h*qH_eXv-ln65eG&J6*h5d6rxsK7;$bU`vs36SAUR-OgEqUQc9>1@uAUa4!|>Dj-+v*| z;9s9m5gwIu!EGuJwaX+0HSLR0d=;XL1_AQAG4nxlXU8_h?di&c z@a~ZmFRDcR17h~$SacO?I)I~rdI<^fhHPeiR1EuxJT~*XzC;rEWHtovL{YDQl;Wd^ zqYGOWg+{F_HW8mY9J?=sqENaXT@O^WvBEu-8u*p{a(kl@luO)`#FxEmAGp_@RnyE_zERC^>;d==tg zb2SVyojr?Gg9lhRv4K?^IQcgdPZ(S1psY~U?~&vNP)Xs6`kueT7My-vq3Sl+I|Va? zhUIALMn+B!d@AH&A0b61t()mC5N&snJ7KmVs$8LE+0u&=ykX0*k=pt@RX#J}tX%|( zmGoxs?0_0i4yU+<{*s7Z>=P{@RHU45MuxxIMZbM0V1q&*hyiupsB6VHNvH9OAaPX5E%mDu3g20 znlG_~q6R@cJMKhVna#hLMV>2{W~dxqvi>rx{cD0gN@}D<1OUeY5;Z_=RI!Vv`%-fR zMd6!gBh65LW;|@X!q?xLkMY3tz8)-7aQX&g8OaTALDM~GQsZoUIQmCni~`*z>-sFe zpB_?i1nr2W2!()P0J&jkMY7zmt6eeOXRC4yVJ5-e%db> zCwNL&w+L`wvS3R!Ytok z;@n*4I>31uDsDe)Y}!@tB^Q=q1jCbk?C#Yh`Kh=G&Z;jbs-;3TB&ko_a>nw>m(mughWdX!bK6vZDDeAo-r4*wocknMb5hzOxLEdSu%S!uvl(Yq^bq;QM zpzgc}4d)=;Fv-k8(p`V*7dJj`b&U=OPBL(I!&wMSOC#1qUVY2m$w7tMC0v>A46J-wSfFmrJl&a9&Isze(OJGjpt60)>ss_)m{{6eQ>dJ; ze$uxUc*Lia&^}GhKs7@5yyhr!meY2)I7hBSo?c@!UgT0VIg$^u16bKW?T^8&<3a8S zX_L!mFe_ngKs>G{--FHRcTH4HE0{{!%azAL@bXiVNzTOav@_mf`)ITQZ?YQZv)CL` z(~){+4i-LO&~k)M>8(WparKK8JR3PLqT=%QXK*~wC%*Kncbx8k@W`3d;&YBcLC*u# zBt7l-N{9bdKQwj&aXY+3ITXsg(egJ+ZZknS3O>}FDU17U1i*jT zLTYLMKD5O`flcEW@2MT3H1?W$I^-IfgNf6sV|pVVh&e;$+l&bxj18^>egxQbEb!t^ zt8euTwIoP0>?s3tUOTO{Fid)bZ*xd(P*rd4f$>u|p6k#rSU@9k6o)#J6$y^(IO8jS z4GVUBQ_e*#+&8@A*3qA^;v+Z@%rigU*|avIx%yx{Le7tfR?UVHN>Hq>LS}{CV;V8y zC7)K_u|$0_R8H1WN;&#T{l2IQNN6)i-#so>Q5nxKjm$lNRW6(F#t2ycTI(wbT{A9^`YmkQj>v6EltJ#n zgaf~Ny|R^6P*0og11z*;klAW3cqFh;_yzC4T+n^bVxvtTBXxz-k#THd;sIg>ZD7+~ z#CE{^hA%6UpJx4(-&?J1tL#&(aKxF6Oxq|kUz@h zJ{s?ST<8$_dCZrhZp`GmiI1y;RHF~hjLF@$?P1kz zxa5yFlL3YOWkAiQQ=d~!V>AxaF*lkG;*ZU9*f&NYMXO}3B@a>Ubc=-gy>7iT)L=sM z5w`2m^z>LrNf|j_nVz{Dd4n0_Knc54Ta%Jb#fYnvB58banaQp;mpP{uml&v@H}EqA z3kPJ)=U+`Uegv{3Eo6?Y_aI)NPfaL~a(#ecTy_SFXU)9oA<$DXd+NU6GnV36g}4t7 znNjFFn@QD)$Rj8EPXNPPuSAbt1l^noeygJ_kJ3X&=N4J^9hj}*@q1x57_wZcdxy-w zRCMxh2*Av@zNCna&*|sfTy$RcjI+ga3mUb;g5f?)(ADZkrS%~CED@vW1%8c)A@YU( zvO`}mnZ{xky63(wsfDVE?4vwZjwO0%a_#?9vueRA?0C@s`YR=79pEs`^mrk9t=2+? 
z5uSQu76aF{M8YDs(}xK{|I9>;LHc#}X^uiiSgjctpI=;WffWf_|K`6N^<|`x+NJi4 z0ZjsZg=!4+hyWL;?{6QGX26es<&-hm%|$B3i(j31H+2s=yey<%^9`q&cFo$W#pL>y zMY*Owl$4JL!I`B=a5S})s*IH>yB=JEsE;MpXCb`M3h5u}xIfm8wer5TS*}(}tlfv; z5vufqz<6Xa+8Hep_*KSgf?7VD2}v++l-ohUL4?vpjp)xBX9dhmw8E_>L3&Oro#d~+ z=}>G=;iQTR5#ZOE<3Hr5P04A9vae?J7Eo1^7fevFm7akeDJiDj+l-zM zgs)R6M4KUw)xpQgnkUcnqytzkI)1;35{4d`83ZA$UU$t9Ut-))CQx)|6k%@XC2W|f0c{A>?XEtw?yOfEl%>ucZdtl{K5o(fnbW|DV(X8LE}kAP?7`r{(4_35%9L=}qv^$;J5#@7{u)nWT~zi5-D<*~xpG^z+#-j_c1(_9 z7{1_HYG`qkk_8CtAlTubK{)&)`5SCkAwHNHqwmb4)JSoh7JLw*HA(aO);PpA#?;I3eq7WMT$XCsz@jFDh3IlQ~@c{rI*kJ3<%Omz)R6k z#ZbjBT<5#*?3we|*)wa_*?XTgYd_B~fUh#RHnTypyX=zDOQA$mD827-4osQjig3yczZRlJJ0l)NE@2pK^Qs(;rjxx0kab z{QWA-^Qkbwxy@WqDYZyx3%+Ced4Go?vf;u$x# z^V^aD+?NsMQ7SGz_#HfCg{*K**4JHsO?mflJ1CF)$UMi<4@3*z| zRvdDQ2fG0fc-)#x?)*J(A&Zpe2+9o?CuBd`w6f*Um9(_=ts=oKOQ;8$UCWvNpjl&9 z+y;XR0EHX@TMLMZTanTWSf@5(rnT1*JIhAM__ywdsucpzhXN+}(9p{!r5lmg+s;s- zL0z6#SF866rt3$W$&MnI{GGZCUuwyTRzqNT-}dA;`~F`dO|zN)L64YArs#UJ8<2>n z)7!WDKEfDaefUoXmyRmJf$&-9?gR;Dgp}u8jKov^`GY4I`}g0bp z?f8WqaZwem*T%?WCePpe5E3_bt1VLh0oC=CJqBdsL7m$~pX6E@O}Rr$8USakWZD2; z=tHERCEd)jyl0hKl=iH4#?r??<8j!Brl+4%iFwlQ)(ajM7nfTCxA=I?+k?R~<85~W zs3~P(KYvNUeo|mIIG`BMC+2SbN&JoZp3T{!!b~IJJZ+9F1M3@6V)%s;)h*@;7L5j7 zswQ4xax+yRduNmaG^g(R`OQ~CetSQMuiBVXccV3U>_eGiUbE=Mn{~e*e3mikX3>&W zI3iGN>jF29k*@V!I*gTyS5Xb_tU+UGjVQ&v*cmhP9Af63%XNrz z{aQO;_>ADhI~DQ{v5nA1ZMiZwf(XfJ=-?*}Kj?z5EG6APQ_1LuI65LU_ZRbYCVsZ^ zYV*VdRmN3Yy`A?6lgmfbam$3pXGUYNwyxtY57LjghbP64`3JAMcdEE^*;f?NxD0-2 z5qi)$ZlHMGP(vhfKvsUELyAT^_%uJFVDxOgq!8zt)_8ZAifyo zX)V&AM!$=&^|;G6D7U=$9QdIM`aYPT!)aT$Ozw2ECtagvG>`ktp)a@EzYK)sW43BC z%Rl|Bp_aCLBP{eDOt!v%qk*oM70QLDa0p9{gryi>haKq~8%M4``PT4s&Yy#o64MYm z&~Y)0C_+JbLP>h7(dkL><znWiG$y|xXF zob62Ag@j3kEjW7-D7An-Ex}CL8uS@eVWrxmI;tbDYKjKl<;9zxSyClqYzpe_jOKER zG$v-iS@K$7ipHwG^5Q4q3rG-4ol0KFS;_RfZ0q-5U#LSx+>6Yc*0t3h-W~Avaw}la zbRJRa!R>t}S&+EzT+XLxmxq~qB%&Ob!7Q-&9?M+wL=r#m4guM;&YfvYzQmMFCRYt z>I~&IsQW8?W>gg4m-f{5?-4Z<)_KoKmVn4+&Z9I`k@K}XOdE~qXYeM{EO7T3veFTi zr03Y!3V=HA*>MP&cR?H<`pQPleT>1a)cN>$OLG)Y{BgdANO^(wb$raZoyPt( z;gOg8w2C2rTwm{oAqqfKmUEVUJh!W$Z<&o8>IHa~B!AA+-Id(-`NvK%PgWt*wnMc$ zYVmS|yoLRG4WHf-)h^#W?~xzoqoYwI#LIg7lX2q$dvp}jb4+Cd)}cYG*m02HOG&%ZK7{F>Xdt3>%T3+Lykl1T>KqO;nTy*FIc zdI)RaWq%K7b zK>PRL^`mcoN%}1O_C3=BC#OPM+40}N7E`nXb?UWLl%SW{)%m%4hp{ay(hs|C-v{bR zBtKB|OEm+Lpo(5(Uci@B+MA2927Ql6Hk^0{gzAvOA8PM*K;opGt9blc;a$WP9DAd! 
zifSiw1%;miGFfh*oAw0niA?1$;>j%u{xxZV@OiD3@!X>&%#W50Y}`ajy1s>GgsVrB zo+_X8M-l6{Tqvg&y}$x#RRHY$8-`i^KsuWMV&md~4=-SHHLO5u(uYjbAoDchqQh8C zd{jaZ*FbTbk~>PNt0-Y6jOYah?IhyM{Ls2&wJ#bha%a=zQh-WST8q9;;ViHvU&u$@ zs$mmX;|a^vN4bJG8gIfZG;RFuEQaK6KK}9!r%)+AoqXNkNLV1l%yuudt8`P!?Jkm; z3SmFX9$x%lEXb4G{Z|AY95{6;<@OHGoC9$LNL{a9?Bn!)SmIH>nuV%0E;9R-sy3_o zqfKp$Iwg->+2kTMzr*bwnRGmlksS3JpJv=&fy4R*6(9gJQ`vTBkKA~9^YWwWPx%JR2xOq=f|*t7reLJJnYz1(5)l;8xt{J zOYLp!R?QXucJXSh2-LQQ@p$cY?QiauFO$R#as`-4Yn|~5bHG z+#s)Bi|-FGw-m6#+ytjJ9s!P!t4)WSkoa7x{L6hI0boN9oPYVFxMF{L=z`ib_jahf zm-s^Yw{*`FhT~LONIzv!eFe72gNlJuLBr6M-#%!wzIO0Ks|Q5q{xCv&LHZm;q{<*~ zIb?|u)Yj}dw`o{^{_YV?+pB-Y(bS`1NRR|zn56_fc$49Srvdi|u zw3`G$#rds;B&~=WhLP_KLaQ4x_4Cl4aYb@08(JF4;4b*CYxcjcU8;WW-Jt=+{}@5ZgUfgTC<=E9U$)z$P9!)sa#dUCJiF0D9&_k{I zdBw}kI{*%Ao&Dd%F#>_EGOO44{p}*wGYGIU?`JE0iH-Wdfo5iZ&VFkMC-;b~&%0$C zmjCNlx$UUX36r>}bcw0H@^JQ_c4dV`2|6Rg4-%0Z10eW*8DgttVX5@n|*lBM-qNS?h*H!TP>EliIq`EDy&OZ$ai)$L(TjhL5EK%!l{v zZ2zBbMCKv!0WxJQ?`)IhKiiBq%eQ2AGT>xGklS;WqBR}9r!ryh1TJd~X1H_ws`(mn zag?NVITMg#F3PWfUTDRtx2iB&(`8Sud2Y$_Np+CzwMo97WwN;xwXGqV{ znU)YNN_D;vmyT+~1o}|Br9rI^ye}_(8YA4lhW9RaE7FR7lqsX3?_}%pxYZcXm?bjs16Ey2zDz?AAhy z9CobkNd6JillvpNKi=*g!t?QaT=0(neP})hie%Fhq)_TG;a4x_;|c%Zw+s>XzrUux z&l_D7CWk-pW`{nla%zdLP+Yp=U4g$z>TnE=l?;E^P(BA Cu`ROz literal 0 HcmV?d00001 diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 7418ed3e59360c..f955cc75b0a367 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -829,6 +829,7 @@ "glue = datahub.ingestion.source.aws.glue:GlueSource", "sagemaker = datahub.ingestion.source.aws.sagemaker:SagemakerSource", "hana = datahub.ingestion.source.sql.hana:HanaSource", + "hbase = datahub.ingestion.source.sql.hbase:HBaseSource", "hive = datahub.ingestion.source.sql.hive:HiveSource", "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py new file mode 100644 index 00000000000000..5a1a83e99d1948 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py @@ -0,0 +1,540 @@ +""" +HBase Source for DataHub Metadata Ingestion +""" +import logging +from typing import Dict, Iterable, List, Optional, Union + +import pydantic +from pydantic import Field + +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp_builder import ContainerKey +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.common.subtypes import ( + DatasetContainerSubTypes, + DatasetSubTypes, +) +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, + StatefulIngestionSourceBase, +) +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BooleanTypeClass, + BytesTypeClass, + NumberTypeClass, + SchemaField, + SchemaFieldDataTypeClass, + StringTypeClass, +) +from datahub.sdk.container import Container +from datahub.sdk.dataset import Dataset +from datahub.sdk.entity import 
Entity +from datahub.configuration.common import AllowDenyPattern, ConfigModel + +logger = logging.getLogger(__name__) + +PLATFORM_NAME = "hbase" + + +class HBaseSourceConfig(StatefulIngestionConfigBase): + """ + Configuration for HBase source + """ + + host: str = Field( + description="HBase Thrift server hostname or IP address", + ) + port: int = Field( + default=9090, + description="HBase Thrift server port (default: 9090 for Thrift1)", + ) + use_ssl: bool = Field( + default=False, + description="Whether to use SSL/TLS for connection", + ) + auth_mechanism: Optional[str] = Field( + default=None, + description="Authentication mechanism (None, KERBEROS, or custom)", + ) + namespace_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for namespaces to filter in ingestion.", + ) + table_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for tables to filter in ingestion.", + ) + include_column_families: bool = Field( + default=True, + description="Include column families as schema metadata", + ) + max_column_qualifiers: int = Field( + default=100, + description="Maximum number of column qualifiers to sample per column family", + ) + env: str = Field( + default="PROD", + description="Environment to use in namespace when constructing URNs", + ) + platform_instance: Optional[str] = Field( + default=None, + description="Platform instance to use in namespace when constructing URNs", + ) + + +class HBaseSourceReport(ConfigModel): + """ + Report for HBase source + """ + + num_namespaces_scanned: int = 0 + num_tables_scanned: int = 0 + num_tables_failed: int = 0 + dropped_namespaces: List[str] = [] + dropped_tables: List[str] = [] + failures: List[Dict[str, str]] = [] + warnings: List[Dict[str, str]] = [] + + def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: + if ent_type == "namespace": + self.num_namespaces_scanned += 1 + else: + self.num_tables_scanned += 1 + + def report_dropped(self, name: str) -> None: + if "." 
in name: + self.dropped_tables.append(name) + else: + self.dropped_namespaces.append(name) + + def failure(self, message: str, context: Optional[str] = None, exc: Optional[Exception] = None) -> None: + failure_entry = {"message": message} + if context: + failure_entry["context"] = context + if exc: + failure_entry["exception"] = str(exc) + self.failures.append(failure_entry) + logger.error(f"Failure: {message} - Context: {context} - Exception: {exc}") + + def warning(self, message: str, context: Optional[str] = None) -> None: + warning_entry = {"message": message} + if context: + warning_entry["context"] = context + self.warnings.append(warning_entry) + logger.warning(f"Warning: {message} - Context: {context}") + + +class NamespaceKey(ContainerKey): + """Container key for HBase namespace""" + + namespace: str + + +@platform_name("HBase", id="hbase") +@config_class(HBaseSourceConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.CONTAINERS, "Enabled by default") +@capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability( + SourceCapability.DELETION_DETECTION, + "Enabled by default via stateful ingestion", + supported=True, +) +class HBaseSource(StatefulIngestionSourceBase): + """ + This plugin extracts the following metadata from Apache HBase: + + - Namespaces (as containers) + - Tables with their metadata + - Column families and column qualifiers + - Table properties and configuration + + HBase is a distributed, scalable, big data store built on top of Hadoop. + This connector uses the HBase Thrift API to extract metadata. + """ + + config: HBaseSourceConfig + report: HBaseSourceReport + platform: str + + def __init__(self, ctx: PipelineContext, config: HBaseSourceConfig): + super().__init__(config, ctx) + self.ctx = ctx + self.platform = PLATFORM_NAME + self.config = config + self.report = HBaseSourceReport() + self.connection = None + + @classmethod + def create(cls, config_dict: Dict, ctx: PipelineContext) -> "HBaseSource": + config = HBaseSourceConfig.model_validate(config_dict) + return cls(ctx, config) + + def get_platform(self) -> str: + return PLATFORM_NAME + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def _connect(self) -> bool: + """ + Establish connection to HBase via Thrift + """ + try: + # Import HBase Thrift libraries + # Note: This requires happybase or similar HBase Python client + from thrift.transport import TSocket, TTransport + from thrift.protocol import TBinaryProtocol + from hbase import Hbase + + # Create socket + transport = TSocket.TSocket(self.config.host, self.config.port) + + # Wrap in buffered transport + transport = TTransport.TBufferedTransport(transport) + + # Use binary protocol + protocol = TBinaryProtocol.TBinaryProtocol(transport) + + # Create client + self.connection = Hbase.Client(protocol) + + # Open connection + transport.open() + + logger.info(f"Successfully connected to HBase at {self.config.host}:{self.config.port}") + return True + + except ImportError: + self.report.failure( + message="Failed to import HBase Thrift libraries. Please install 'happybase' or 'hbase-thrift' package.", + context="connection" + ) + logger.error("HBase Thrift libraries not found. 
Install with: pip install happybase") + return False + except Exception as e: + self.report.failure( + message="Failed to connect to HBase", + context=f"{self.config.host}:{self.config.port}", + exc=e + ) + return False + + def _get_namespaces(self) -> List[str]: + """ + Get list of namespaces from HBase + """ + try: + # HBase Thrift1 doesn't have direct namespace support + # We'll get all tables and extract namespaces from table names + # Table names in HBase can be namespace:table or just table (default namespace) + tables = self.connection.getTableNames() + + namespaces = set() + for table in tables: + table_str = table.decode('utf-8') if isinstance(table, bytes) else str(table) + if ':' in table_str: + namespace = table_str.split(':', 1)[0] + namespaces.add(namespace) + else: + namespaces.add('default') + + return sorted(list(namespaces)) + + except Exception as e: + self.report.failure( + message="Failed to get namespaces from HBase", + exc=e + ) + return [] + + def _get_tables_in_namespace(self, namespace: str) -> List[str]: + """ + Get all tables in a given namespace + """ + try: + all_tables = self.connection.getTableNames() + namespace_tables = [] + + for table in all_tables: + table_str = table.decode('utf-8') if isinstance(table, bytes) else str(table) + + if namespace == 'default': + # Default namespace tables don't have namespace prefix + if ':' not in table_str: + namespace_tables.append(table_str) + else: + # Check if table belongs to this namespace + if table_str.startswith(f"{namespace}:"): + # Remove namespace prefix for table name + table_name = table_str.split(':', 1)[1] + namespace_tables.append(table_name) + + return namespace_tables + + except Exception as e: + self.report.failure( + message=f"Failed to get tables for namespace {namespace}", + exc=e + ) + return [] + + def _get_table_descriptor(self, full_table_name: str) -> Optional[Dict]: + """ + Get table descriptor including column families + """ + try: + # Convert to bytes if string + table_bytes = full_table_name.encode('utf-8') if isinstance(full_table_name, str) else full_table_name + + # Get column descriptors + descriptors = self.connection.getColumnDescriptors(table_bytes) + + # Convert to dict structure + result = { + "column_families": {} + } + + for cf_name, cf_descriptor in descriptors.items(): + cf_name_str = cf_name.decode('utf-8') if isinstance(cf_name, bytes) else str(cf_name) + # Remove trailing colon if present + cf_name_str = cf_name_str.rstrip(':') + + result["column_families"][cf_name_str] = { + "name": cf_name_str, + "maxVersions": getattr(cf_descriptor, 'maxVersions', 1), + "compression": getattr(cf_descriptor, 'compression', 'NONE'), + "inMemory": getattr(cf_descriptor, 'inMemory', False), + "blockCacheEnabled": getattr(cf_descriptor, 'blockCacheEnabled', True), + "timeToLive": getattr(cf_descriptor, 'timeToLive', -1), + } + + return result + + except Exception as e: + self.report.failure( + message=f"Failed to get descriptor for table {full_table_name}", + exc=e + ) + return None + + def _convert_hbase_type_to_schema_field_type(self, hbase_type: str = "bytes") -> SchemaFieldDataTypeClass: + """ + Convert HBase data types to DataHub schema field types + HBase stores everything as bytes, but we provide common type mappings + """ + type_mapping = { + "string": StringTypeClass(), + "int": NumberTypeClass(), + "long": NumberTypeClass(), + "float": NumberTypeClass(), + "double": NumberTypeClass(), + "boolean": BooleanTypeClass(), + "bytes": BytesTypeClass(), + "array": 
ArrayTypeClass(nestedType=["bytes"]), + } + + return SchemaFieldDataTypeClass(type=type_mapping.get(hbase_type.lower(), BytesTypeClass())) + + def _generate_schema_fields(self, table_descriptor: Dict) -> List[SchemaField]: + """ + Generate schema fields from table descriptor + """ + schema_fields = [] + + # Add row key field (always present in HBase) + schema_fields.append( + SchemaField( + fieldPath="rowkey", + nativeDataType="bytes", + type=self._convert_hbase_type_to_schema_field_type("bytes"), + description="HBase row key", + nullable=False, + isPartOfKey=True, + ) + ) + + # Add column family fields + for cf_name, cf_props in table_descriptor.get("column_families", {}).items(): + schema_fields.append( + SchemaField( + fieldPath=cf_name, + nativeDataType="column_family", + type=SchemaFieldDataTypeClass(type=BytesTypeClass()), + description=f"Column family: {cf_name}", + nullable=True, + isPartOfKey=False, + ) + ) + + return schema_fields + + def _generate_namespace_container(self, namespace: str) -> Container: + """ + Generate container for HBase namespace + """ + namespace_container_key = self._generate_namespace_container_key(namespace) + + return Container( + namespace_container_key, + display_name=namespace, + qualified_name=namespace, + subtype=DatasetContainerSubTypes.SCHEMA, + description=f"HBase namespace: {namespace}", + ) + + def _generate_namespace_container_key(self, namespace: str) -> ContainerKey: + """ + Generate container key for namespace + """ + return NamespaceKey( + namespace=namespace, + platform=self.platform, + instance=self.config.platform_instance, + env=self.config.env, + ) + + def _generate_table_dataset( + self, namespace: str, table_name: str, table_descriptor: Dict + ) -> Optional[Dataset]: + """ + Generate dataset for HBase table + """ + # Full table name with namespace + if namespace == 'default': + full_table_name = table_name + dataset_name = table_name + else: + full_table_name = f"{namespace}:{table_name}" + dataset_name = f"{namespace}.{table_name}" + + self.report.report_entity_scanned(dataset_name, ent_type="table") + + if not self.config.table_pattern.allowed(dataset_name): + self.report.report_dropped(dataset_name) + return None + + # Generate schema fields + schema_fields = None + if self.config.include_column_families and table_descriptor: + try: + schema_fields = self._generate_schema_fields(table_descriptor) + except Exception as e: + self.report.warning( + message="Failed to generate schema fields", + context=dataset_name + ) + + # Generate custom properties + custom_properties = {} + if table_descriptor and "column_families" in table_descriptor: + custom_properties["column_families"] = str(len(table_descriptor["column_families"])) + for cf_name, cf_props in table_descriptor["column_families"].items(): + custom_properties[f"cf.{cf_name}.maxVersions"] = str(cf_props.get("maxVersions", "1")) + custom_properties[f"cf.{cf_name}.compression"] = str(cf_props.get("compression", "NONE")) + + return Dataset( + platform=self.platform, + name=dataset_name, + env=self.config.env, + platform_instance=self.config.platform_instance, + subtype=DatasetSubTypes.TABLE, + parent_container=self._generate_namespace_container_key(namespace), + schema=schema_fields, + display_name=table_name, + qualified_name=full_table_name, + description=f"HBase table in namespace '{namespace}'", + custom_properties=custom_properties, + ) + + def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]: + """ + Main method to generate work units + """ + # Connect 
to HBase + if not self._connect(): + return + + # Get all namespaces + namespaces = self._get_namespaces() + + for namespace in namespaces: + # Check if namespace matches pattern + if not self.config.namespace_pattern.allowed(namespace): + self.report.report_dropped(namespace) + continue + + self.report.report_entity_scanned(namespace, ent_type="namespace") + + # Generate namespace container + yield self._generate_namespace_container(namespace) + + # Get tables in namespace + tables = self._get_tables_in_namespace(namespace) + + for table_name in tables: + try: + # Get full table name for HBase API + if namespace == 'default': + full_table_name = table_name + else: + full_table_name = f"{namespace}:{table_name}" + + # Get table descriptor + table_descriptor = self._get_table_descriptor(full_table_name) + + # Generate table dataset + dataset = self._generate_table_dataset( + namespace, table_name, table_descriptor + ) + + if dataset: + yield dataset + + except Exception as e: + self.report.num_tables_failed += 1 + self.report.failure( + message="Failed to process table", + context=f"{namespace}:{table_name}", + exc=e + ) + + def get_report(self) -> HBaseSourceReport: + """ + Return the ingestion report + """ + return self.report + + def close(self) -> None: + """ + Clean up resources + """ + if self.connection: + try: + # Close connection if it has a close method + if hasattr(self.connection, 'close'): + self.connection.close() + except Exception as e: + logger.warning(f"Error closing HBase connection: {e}") + + super().close() From fe1347297120d46ce37d9717cdcc29f6f8cf6f3c Mon Sep 17 00:00:00 2001 From: btkcodedev Date: Thu, 13 Nov 2025 08:38:42 +0530 Subject: [PATCH 2/5] chore: add missing frontend components, add docs add deps --- .../app/ingest/source/builder/constants.ts | 4 + .../app/ingest/source/builder/sources.json | 8 + .../source/builder/RecipeForm/constants.ts | 21 ++ .../source/builder/RecipeForm/hbase.ts | 95 ++++++++ .../app/ingestV2/source/builder/constants.ts | 4 + .../app/ingestV2/source/builder/sources.json | 8 + .../docs/sources/hbase/hbase_pre.md | 59 +++++ .../docs/sources/hbase/hbase_recipe.yml | 33 +++ metadata-ingestion/setup.py | 4 + .../src/datahub/ingestion/source/sql/hbase.py | 204 ++++++++++-------- 10 files changed, 350 insertions(+), 90 deletions(-) create mode 100644 datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/hbase.ts create mode 100644 metadata-ingestion/docs/sources/hbase/hbase_pre.md create mode 100644 metadata-ingestion/docs/sources/hbase/hbase_recipe.yml diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index 35e3514dc2eaff..6fadc98ca4b110 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -17,6 +17,7 @@ import fivetranLogo from '@images/fivetranlogo.png'; import glueLogo from '@images/gluelogo.png'; import googleSheetsLogo from '@images/google-sheets-logo.png'; import grafanaLogo from '@images/grafana.png'; +import hbaseLogo from '@images/hbaselogo.png'; import hiveLogo from '@images/hivelogo.png'; import kafkaLogo from '@images/kafkalogo.png'; import lookerLogo from '@images/lookerlogo.svg'; @@ -75,6 +76,8 @@ export const GLUE = 'glue'; export const GLUE_URN = `urn:li:dataPlatform:${GLUE}`; export const GRAFANA = 'grafana'; export const GRAFANA_URN = `urn:li:dataPlatform:${GRAFANA}`; +export const HBASE = 'hbase'; +export const HBASE_URN = 
`urn:li:dataPlatform:${HBASE}`; export const HIVE = 'hive'; export const HIVE_URN = `urn:li:dataPlatform:${HIVE}`; export const KAFKA = 'kafka'; @@ -170,6 +173,7 @@ export const PLATFORM_URN_TO_LOGO = { [FEAST_URN]: feastLogo, [GLUE_URN]: glueLogo, [GRAFANA_URN]: grafanaLogo, + [HBASE_URN]: hbaseLogo, [HIVE_URN]: hiveLogo, [KAFKA_URN]: kafkaLogo, [LOOKER_URN]: lookerLogo, diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 08788f9a8190bd..726e078f493202 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -111,6 +111,14 @@ "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/hive/", "recipe": "source: \n type: hive\n config:\n # Coordinates\n host_port: # Your Hive host and port, e.g. hive:10000\n database: # Your Hive database name, e.g. SampleDatabase (Optional, if not specified, ingests from all databases)\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your Hive username, e.g. admin\n stateful_ingestion:\n enabled: true" }, + { + "urn": "urn:li:dataPlatform:hbase", + "name": "hbase", + "displayName": "HBase", + "description": "Import Namespaces, Tables, Column Families, and metadata from Apache HBase.", + "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/hbase/", + "recipe": "source: \n type: hbase\n config:\n # Coordinates\n host: localhost # Your HBase Thrift server hostname\n port: 9090 # Your HBase Thrift server port\n\n # Optional configurations\n use_ssl: false\n auth_mechanism: null # Options: null, KERBEROS, or custom\n\n # Schema extraction\n include_column_families: true\n max_column_qualifiers: 100\n\n stateful_ingestion:\n enabled: true" + }, { "urn": "urn:li:dataPlatform:presto", "name": "presto", diff --git a/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts index 1197ec6b4e00fa..e521186efd9f00 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts +++ b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/constants.ts @@ -73,6 +73,16 @@ import { TARGET_PLATFORM, TARGET_PLATFORM_INSTANCE, } from '@app/ingestV2/source/builder/RecipeForm/dbt_cloud'; +import { + HBASE_AUTH_MECHANISM, + HBASE_HOST, + HBASE_INCLUDE_COLUMN_FAMILIES, + HBASE_MAX_COLUMN_QUALIFIERS, + HBASE_PORT, + HBASE_USE_SSL, + NAMESPACE_ALLOW, + NAMESPACE_DENY, +} from '@app/ingestV2/source/builder/RecipeForm/hbase'; import { HIVE_DATABASE, HIVE_HOST_PORT, @@ -235,6 +245,7 @@ import { CSV, DATABRICKS, DBT_CLOUD, + HBASE, MYSQL, OKTA, POWER_BI, @@ -423,6 +434,16 @@ export const RECIPE_FIELDS: RecipeFields = { advancedFields: [INCLUDE_TABLES, TABLE_PROFILING_ENABLED, COLUMN_PROFILING_ENABLED, STATEFUL_INGESTION_ENABLED], filterSectionTooltip: 'Include or exclude specific Schemas, Tables and Views from ingestion.', }, + [HBASE]: { + fields: [HBASE_HOST, HBASE_PORT, HBASE_USE_SSL, HBASE_AUTH_MECHANISM], + filterFields: [NAMESPACE_ALLOW, NAMESPACE_DENY, TABLE_ALLOW, TABLE_DENY], + advancedFields: [ + HBASE_INCLUDE_COLUMN_FAMILIES, + HBASE_MAX_COLUMN_QUALIFIERS, + STATEFUL_INGESTION_ENABLED, + ], + filterSectionTooltip: 'Include or exclude specific Namespaces and Tables from ingestion.', + }, [PRESTO]: { fields: [PRESTO_HOST_PORT, PRESTO_USERNAME, PRESTO_PASSWORD, PRESTO_DATABASE], filterFields: 
[SCHEMA_ALLOW, SCHEMA_DENY, TABLE_ALLOW, TABLE_DENY, VIEW_ALLOW, VIEW_DENY], diff --git a/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/hbase.ts b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/hbase.ts new file mode 100644 index 00000000000000..bfb331570ed026 --- /dev/null +++ b/datahub-web-react/src/app/ingestV2/source/builder/RecipeForm/hbase.ts @@ -0,0 +1,95 @@ +import { RecipeField, FieldType, setListValuesOnRecipe } from './common'; + +export const HBASE_HOST: RecipeField = { + name: 'host', + label: 'Host', + tooltip: 'HBase Thrift server hostname or IP address', + type: FieldType.TEXT, + fieldPath: 'source.config.host', + placeholder: 'localhost', + required: true, + rules: null, +}; + +export const HBASE_PORT: RecipeField = { + name: 'port', + label: 'Port', + tooltip: 'HBase Thrift server port (default: 9090 for Thrift1)', + type: FieldType.TEXT, + fieldPath: 'source.config.port', + placeholder: '9090', + required: false, + rules: null, +}; + +export const HBASE_USE_SSL: RecipeField = { + name: 'use_ssl', + label: 'Use SSL', + tooltip: 'Whether to use SSL/TLS for connection', + type: FieldType.BOOLEAN, + fieldPath: 'source.config.use_ssl', + required: false, + rules: null, +}; + +export const HBASE_AUTH_MECHANISM: RecipeField = { + name: 'auth_mechanism', + label: 'Authentication Mechanism', + tooltip: 'Authentication mechanism (None, KERBEROS, or custom)', + type: FieldType.TEXT, + fieldPath: 'source.config.auth_mechanism', + placeholder: 'KERBEROS', + required: false, + rules: null, +}; + +export const NAMESPACE_ALLOW: RecipeField = { + name: 'namespace_pattern.allow', + label: 'Allow Patterns for Namespace', + tooltip: + 'Use regex here. e.g. to allow all namespaces, use ".*" or to allow namespaces starting with "production" use "production.*"', + placeholder: '.*', + type: FieldType.LIST, + buttonLabel: 'Add pattern', + fieldPath: 'source.config.namespace_pattern.allow', + rules: null, + section: 'Namespaces', + setValueOnRecipeOverride: (recipe: any, values: string[]) => + setListValuesOnRecipe(recipe, values, 'source.config.namespace_pattern.allow'), +}; + +export const NAMESPACE_DENY: RecipeField = { + name: 'namespace_pattern.deny', + label: 'Deny Patterns for Namespace', + tooltip: + 'Use regex here. Deny patterns take precedence over allow patterns. e.g. 
to deny all system namespaces, use "system.*"', + placeholder: 'system.*', + type: FieldType.LIST, + buttonLabel: 'Add pattern', + fieldPath: 'source.config.namespace_pattern.deny', + rules: null, + section: 'Namespaces', + setValueOnRecipeOverride: (recipe: any, values: string[]) => + setListValuesOnRecipe(recipe, values, 'source.config.namespace_pattern.deny'), +}; + +export const HBASE_INCLUDE_COLUMN_FAMILIES: RecipeField = { + name: 'include_column_families', + label: 'Include Column Families', + tooltip: 'Whether to include column families as schema metadata', + type: FieldType.BOOLEAN, + fieldPath: 'source.config.include_column_families', + required: false, + rules: null, +}; + +export const HBASE_MAX_COLUMN_QUALIFIERS: RecipeField = { + name: 'max_column_qualifiers', + label: 'Max Column Qualifiers', + tooltip: 'Maximum number of column qualifiers to sample per column family', + type: FieldType.TEXT, + fieldPath: 'source.config.max_column_qualifiers', + placeholder: '100', + required: false, + rules: null, +}; diff --git a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts index c7082faf0194c9..b9f8adc945f48f 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingestV2/source/builder/constants.ts @@ -15,6 +15,7 @@ import elasticsearchLogo from '@images/elasticsearchlogo.png'; import feastLogo from '@images/feastlogo.png'; import fivetranLogo from '@images/fivetranlogo.png'; import glueLogo from '@images/gluelogo.png'; +import hbaseLogo from '@images/hbaselogo.png'; import hiveLogo from '@images/hivelogo.png'; import kafkaLogo from '@images/kafkalogo.png'; import lookerLogo from '@images/lookerlogo.svg'; @@ -69,6 +70,8 @@ export const FEAST_LEGACY = 'feast-legacy'; export const FEAST_URN = `urn:li:dataPlatform:${FEAST}`; export const GLUE = 'glue'; export const GLUE_URN = `urn:li:dataPlatform:${GLUE}`; +export const HBASE = 'hbase'; +export const HBASE_URN = `urn:li:dataPlatform:${HBASE}`; export const HIVE = 'hive'; export const HIVE_URN = `urn:li:dataPlatform:${HIVE}`; export const KAFKA = 'kafka'; @@ -159,6 +162,7 @@ export const PLATFORM_URN_TO_LOGO = { [ELASTICSEARCH_URN]: elasticsearchLogo, [FEAST_URN]: feastLogo, [GLUE_URN]: glueLogo, + [HBASE_URN]: hbaseLogo, [HIVE_URN]: hiveLogo, [KAFKA_URN]: kafkaLogo, [LOOKER_URN]: lookerLogo, diff --git a/datahub-web-react/src/app/ingestV2/source/builder/sources.json b/datahub-web-react/src/app/ingestV2/source/builder/sources.json index aa7543d4b21998..7714db65874e78 100644 --- a/datahub-web-react/src/app/ingestV2/source/builder/sources.json +++ b/datahub-web-react/src/app/ingestV2/source/builder/sources.json @@ -111,6 +111,14 @@ "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/hive/", "recipe": "source: \n type: hive\n config:\n # Coordinates\n host_port: # Your Hive host and port, e.g. hive:10000\n database: # Your Hive database name, e.g. SampleDatabase (Optional, if not specified, ingests from all databases)\n\n # Credentials\n # Add secret in Secrets Tab with relevant names for each variable\n username: null # Your Hive username, e.g. 
admin\n stateful_ingestion:\n enabled: true" }, + { + "urn": "urn:li:dataPlatform:hbase", + "name": "hbase", + "displayName": "HBase", + "description": "Import Namespaces, Tables, Column Families, and metadata from Apache HBase.", + "docsUrl": "https://docs.datahub.com/docs/generated/ingestion/sources/hbase/", + "recipe": "source: \n type: hbase\n config:\n # Coordinates\n host: localhost # Your HBase Thrift server hostname\n port: 9090 # Your HBase Thrift server port\n\n # Optional configurations\n use_ssl: false\n auth_mechanism: null # Options: null, KERBEROS, or custom\n\n # Schema extraction\n include_column_families: true\n max_column_qualifiers: 100\n\n stateful_ingestion:\n enabled: true" + }, { "urn": "urn:li:dataPlatform:presto", "name": "presto", diff --git a/metadata-ingestion/docs/sources/hbase/hbase_pre.md b/metadata-ingestion/docs/sources/hbase/hbase_pre.md new file mode 100644 index 00000000000000..4e6d1d111f91bd --- /dev/null +++ b/metadata-ingestion/docs/sources/hbase/hbase_pre.md @@ -0,0 +1,59 @@ +### Setup + +This integration extracts metadata from Apache HBase via the Thrift API, including information about: + +- Namespaces +- Tables +- Column families and their properties +- Table configurations + +You'll need to have HBase Thrift server running and accessible with appropriate permissions. + +#### Prerequisites + +1. **Install Required Python Packages**: + + ```bash + pip install 'acryl-datahub[hbase]' + ``` + + This will install `happybase` and `thrift` packages required for connecting to HBase. + +2. **HBase Thrift Server**: + + - Ensure the HBase Thrift server is running (typically on port 9090). + - Start the Thrift server if not already running: + ```bash + hbase thrift start -p 9090 + ``` + +3. **Network Access**: + + - The host running DataHub ingestion must have network access to the HBase Thrift server. + - Verify connectivity: + ```bash + telnet 9090 + ``` + +4. **Permissions**: + - The user/service account must have read access to: + - System tables for metadata extraction + - Target namespaces and tables you want to ingest + +#### Authentication + +- **No Authentication**: By default, the connector uses no authentication. +- **Kerberos**: Set `auth_mechanism: "KERBEROS"` in the configuration. +- **Custom Authentication**: Specify your authentication mechanism in the `auth_mechanism` field. + +:::note + +For production deployments, it's recommended to use secure authentication mechanisms and SSL/TLS connections. + +::: + +:::caution + +The connector samples column qualifiers to extract schema information. For tables with many column qualifiers, adjust the `max_column_qualifiers` parameter to control the sampling size and avoid performance issues. 
+ +::: diff --git a/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml b/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml new file mode 100644 index 00000000000000..05dc123149094c --- /dev/null +++ b/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml @@ -0,0 +1,33 @@ +source: + type: hbase + config: + # Coordinates + host: localhost + port: 9090 # default Thrift port + + # Optional configurations + use_ssl: false + auth_mechanism: null # Options: null, KERBEROS, or custom + + # Filtering patterns + namespace_pattern: + allow: + - ".*" + deny: + - "system" + table_pattern: + allow: + - ".*" + # deny: + # - "temp_.*" + + # Schema extraction options + include_column_families: true + max_column_qualifiers: 100 + + # Environment and platform instance + env: "PROD" + platform_instance: null # Optional: specify if you have multiple HBase instances + +sink: + # sink configs diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index f955cc75b0a367..6876cb07982379 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -508,6 +508,10 @@ "sqlalchemy-hana>=0.5.0; platform_machine != 'aarch64' and platform_machine != 'arm64'", "hdbcli>=2.11.20; platform_machine != 'aarch64' and platform_machine != 'arm64'", }, + "hbase": { + "happybase>=1.2.0", + "thrift>=0.13.0", + }, "hive": sql_common | pyhive_common | { diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py index 5a1a83e99d1948..5316cfe69a2b23 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py @@ -1,13 +1,13 @@ """ HBase Source for DataHub Metadata Ingestion """ + import logging from typing import Dict, Iterable, List, Optional, Union -import pydantic from pydantic import Field -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.emitter.mcp_builder import ContainerKey from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( @@ -43,7 +43,6 @@ from datahub.sdk.container import Container from datahub.sdk.dataset import Dataset from datahub.sdk.entity import Entity -from datahub.configuration.common import AllowDenyPattern, ConfigModel logger = logging.getLogger(__name__) @@ -121,7 +120,12 @@ def report_dropped(self, name: str) -> None: else: self.dropped_namespaces.append(name) - def failure(self, message: str, context: Optional[str] = None, exc: Optional[Exception] = None) -> None: + def failure( + self, + message: str, + context: Optional[str] = None, + exc: Optional[Exception] = None, + ) -> None: failure_entry = {"message": message} if context: failure_entry["context"] = context @@ -203,40 +207,44 @@ def _connect(self) -> bool: try: # Import HBase Thrift libraries # Note: This requires happybase or similar HBase Python client - from thrift.transport import TSocket, TTransport - from thrift.protocol import TBinaryProtocol from hbase import Hbase + from thrift.protocol import TBinaryProtocol + from thrift.transport import TSocket, TTransport # Create socket transport = TSocket.TSocket(self.config.host, self.config.port) - + # Wrap in buffered transport transport = TTransport.TBufferedTransport(transport) - + # Use binary protocol protocol = TBinaryProtocol.TBinaryProtocol(transport) - + # Create client self.connection = Hbase.Client(protocol) - + # Open connection 
transport.open() - - logger.info(f"Successfully connected to HBase at {self.config.host}:{self.config.port}") + + logger.info( + f"Successfully connected to HBase at {self.config.host}:{self.config.port}" + ) return True - + except ImportError: self.report.failure( message="Failed to import HBase Thrift libraries. Please install 'happybase' or 'hbase-thrift' package.", - context="connection" + context="connection", + ) + logger.error( + "HBase Thrift libraries not found. Install with: pip install happybase" ) - logger.error("HBase Thrift libraries not found. Install with: pip install happybase") return False except Exception as e: self.report.failure( message="Failed to connect to HBase", context=f"{self.config.host}:{self.config.port}", - exc=e + exc=e, ) return False @@ -249,23 +257,22 @@ def _get_namespaces(self) -> List[str]: # We'll get all tables and extract namespaces from table names # Table names in HBase can be namespace:table or just table (default namespace) tables = self.connection.getTableNames() - + namespaces = set() for table in tables: - table_str = table.decode('utf-8') if isinstance(table, bytes) else str(table) - if ':' in table_str: - namespace = table_str.split(':', 1)[0] + table_str = ( + table.decode("utf-8") if isinstance(table, bytes) else str(table) + ) + if ":" in table_str: + namespace = table_str.split(":", 1)[0] namespaces.add(namespace) else: - namespaces.add('default') - + namespaces.add("default") + return sorted(list(namespaces)) - + except Exception as e: - self.report.failure( - message="Failed to get namespaces from HBase", - exc=e - ) + self.report.failure(message="Failed to get namespaces from HBase", exc=e) return [] def _get_tables_in_namespace(self, namespace: str) -> List[str]: @@ -275,27 +282,28 @@ def _get_tables_in_namespace(self, namespace: str) -> List[str]: try: all_tables = self.connection.getTableNames() namespace_tables = [] - + for table in all_tables: - table_str = table.decode('utf-8') if isinstance(table, bytes) else str(table) - - if namespace == 'default': + table_str = ( + table.decode("utf-8") if isinstance(table, bytes) else str(table) + ) + + if namespace == "default": # Default namespace tables don't have namespace prefix - if ':' not in table_str: + if ":" not in table_str: namespace_tables.append(table_str) else: # Check if table belongs to this namespace if table_str.startswith(f"{namespace}:"): # Remove namespace prefix for table name - table_name = table_str.split(':', 1)[1] + table_name = table_str.split(":", 1)[1] namespace_tables.append(table_name) - + return namespace_tables - + except Exception as e: self.report.failure( - message=f"Failed to get tables for namespace {namespace}", - exc=e + message=f"Failed to get tables for namespace {namespace}", exc=e ) return [] @@ -305,40 +313,49 @@ def _get_table_descriptor(self, full_table_name: str) -> Optional[Dict]: """ try: # Convert to bytes if string - table_bytes = full_table_name.encode('utf-8') if isinstance(full_table_name, str) else full_table_name - + table_bytes = ( + full_table_name.encode("utf-8") + if isinstance(full_table_name, str) + else full_table_name + ) + # Get column descriptors descriptors = self.connection.getColumnDescriptors(table_bytes) - + # Convert to dict structure - result = { - "column_families": {} - } - + result = {"column_families": {}} + for cf_name, cf_descriptor in descriptors.items(): - cf_name_str = cf_name.decode('utf-8') if isinstance(cf_name, bytes) else str(cf_name) + cf_name_str = ( + cf_name.decode("utf-8") + if 
isinstance(cf_name, bytes) + else str(cf_name) + ) # Remove trailing colon if present - cf_name_str = cf_name_str.rstrip(':') - + cf_name_str = cf_name_str.rstrip(":") + result["column_families"][cf_name_str] = { "name": cf_name_str, - "maxVersions": getattr(cf_descriptor, 'maxVersions', 1), - "compression": getattr(cf_descriptor, 'compression', 'NONE'), - "inMemory": getattr(cf_descriptor, 'inMemory', False), - "blockCacheEnabled": getattr(cf_descriptor, 'blockCacheEnabled', True), - "timeToLive": getattr(cf_descriptor, 'timeToLive', -1), + "maxVersions": getattr(cf_descriptor, "maxVersions", 1), + "compression": getattr(cf_descriptor, "compression", "NONE"), + "inMemory": getattr(cf_descriptor, "inMemory", False), + "blockCacheEnabled": getattr( + cf_descriptor, "blockCacheEnabled", True + ), + "timeToLive": getattr(cf_descriptor, "timeToLive", -1), } - + return result - + except Exception as e: self.report.failure( - message=f"Failed to get descriptor for table {full_table_name}", - exc=e + message=f"Failed to get descriptor for table {full_table_name}", exc=e ) return None - def _convert_hbase_type_to_schema_field_type(self, hbase_type: str = "bytes") -> SchemaFieldDataTypeClass: + def _convert_hbase_type_to_schema_field_type( + self, hbase_type: str = "bytes" + ) -> SchemaFieldDataTypeClass: """ Convert HBase data types to DataHub schema field types HBase stores everything as bytes, but we provide common type mappings @@ -353,15 +370,17 @@ def _convert_hbase_type_to_schema_field_type(self, hbase_type: str = "bytes") -> "bytes": BytesTypeClass(), "array": ArrayTypeClass(nestedType=["bytes"]), } - - return SchemaFieldDataTypeClass(type=type_mapping.get(hbase_type.lower(), BytesTypeClass())) + + return SchemaFieldDataTypeClass( + type=type_mapping.get(hbase_type.lower(), BytesTypeClass()) + ) def _generate_schema_fields(self, table_descriptor: Dict) -> List[SchemaField]: """ Generate schema fields from table descriptor """ schema_fields = [] - + # Add row key field (always present in HBase) schema_fields.append( SchemaField( @@ -373,9 +392,9 @@ def _generate_schema_fields(self, table_descriptor: Dict) -> List[SchemaField]: isPartOfKey=True, ) ) - + # Add column family fields - for cf_name, cf_props in table_descriptor.get("column_families", {}).items(): + for cf_name, _cf_props in table_descriptor.get("column_families", {}).items(): schema_fields.append( SchemaField( fieldPath=cf_name, @@ -386,7 +405,7 @@ def _generate_schema_fields(self, table_descriptor: Dict) -> List[SchemaField]: isPartOfKey=False, ) ) - + return schema_fields def _generate_namespace_container(self, namespace: str) -> Container: @@ -394,7 +413,7 @@ def _generate_namespace_container(self, namespace: str) -> Container: Generate container for HBase namespace """ namespace_container_key = self._generate_namespace_container_key(namespace) - + return Container( namespace_container_key, display_name=namespace, @@ -421,38 +440,43 @@ def _generate_table_dataset( Generate dataset for HBase table """ # Full table name with namespace - if namespace == 'default': + if namespace == "default": full_table_name = table_name dataset_name = table_name else: full_table_name = f"{namespace}:{table_name}" dataset_name = f"{namespace}.{table_name}" - + self.report.report_entity_scanned(dataset_name, ent_type="table") - + if not self.config.table_pattern.allowed(dataset_name): self.report.report_dropped(dataset_name) return None - + # Generate schema fields schema_fields = None if self.config.include_column_families and table_descriptor: 
try: schema_fields = self._generate_schema_fields(table_descriptor) - except Exception as e: + except Exception: self.report.warning( - message="Failed to generate schema fields", - context=dataset_name + message="Failed to generate schema fields", context=dataset_name ) - + # Generate custom properties custom_properties = {} if table_descriptor and "column_families" in table_descriptor: - custom_properties["column_families"] = str(len(table_descriptor["column_families"])) + custom_properties["column_families"] = str( + len(table_descriptor["column_families"]) + ) for cf_name, cf_props in table_descriptor["column_families"].items(): - custom_properties[f"cf.{cf_name}.maxVersions"] = str(cf_props.get("maxVersions", "1")) - custom_properties[f"cf.{cf_name}.compression"] = str(cf_props.get("compression", "NONE")) - + custom_properties[f"cf.{cf_name}.maxVersions"] = str( + cf_props.get("maxVersions", "1") + ) + custom_properties[f"cf.{cf_name}.compression"] = str( + cf_props.get("compression", "NONE") + ) + return Dataset( platform=self.platform, name=dataset_name, @@ -474,49 +498,49 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]: # Connect to HBase if not self._connect(): return - + # Get all namespaces namespaces = self._get_namespaces() - + for namespace in namespaces: # Check if namespace matches pattern if not self.config.namespace_pattern.allowed(namespace): self.report.report_dropped(namespace) continue - + self.report.report_entity_scanned(namespace, ent_type="namespace") - + # Generate namespace container yield self._generate_namespace_container(namespace) - + # Get tables in namespace tables = self._get_tables_in_namespace(namespace) - + for table_name in tables: try: # Get full table name for HBase API - if namespace == 'default': + if namespace == "default": full_table_name = table_name else: full_table_name = f"{namespace}:{table_name}" - + # Get table descriptor table_descriptor = self._get_table_descriptor(full_table_name) - + # Generate table dataset dataset = self._generate_table_dataset( namespace, table_name, table_descriptor ) - + if dataset: yield dataset - + except Exception as e: self.report.num_tables_failed += 1 self.report.failure( message="Failed to process table", context=f"{namespace}:{table_name}", - exc=e + exc=e, ) def get_report(self) -> HBaseSourceReport: @@ -532,9 +556,9 @@ def close(self) -> None: if self.connection: try: # Close connection if it has a close method - if hasattr(self.connection, 'close'): + if hasattr(self.connection, "close"): self.connection.close() except Exception as e: logger.warning(f"Error closing HBase connection: {e}") - + super().close() From 62adb62368ada31a088859be1ab7310d3f9b454b Mon Sep 17 00:00:00 2001 From: btkcodedev Date: Thu, 13 Nov 2025 08:43:01 +0530 Subject: [PATCH 3/5] chore: resize logo to square --- datahub-web-react/src/images/hbaselogo.png | Bin 21196 -> 4488 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/datahub-web-react/src/images/hbaselogo.png b/datahub-web-react/src/images/hbaselogo.png index e91eb8d7401ad07cf9a7ac7e062200e6eb488929..af2013b8c267e35343690cb7a5bdcdd140a44247 100644 GIT binary patch literal 4488 zcmV;35qIv1P)P#Kgp*p`qDUQ{jDjD=RC|Eh(o{ zhV}pe5e-R1K~#9!?45~%qaX}GIcl^8TKfM#yPOIhcvahVLT6{TMUFmW2q7Y=^F;E5 zxu42K90(dgV=jV5&2zqHQ0R(+QE)fkw(8uSh#v}xNc&_8zNCbUwE@=?-DY?{x7MBG5JH3P;(2cGE zMfkwnprH};;h=-&XqtW|g8pPKK;+yg1btv`&0Ej{K_3ZPwxAh;J`%KOK{seP9|@Xv zpgDp*N1_#iJ_j^K(C4TQjiAqw=mrq<8Rp%4=Nub5$K>toJGtiB!r^|x% zsiJ^y&E*+rGdqkhq3<KAv 
zifSiw1%;miGFfh*oAw0niA?1$;>j%u{xxZV@OiD3@!X>&%#W50Y}`ajy1s>GgsVrB zo+_X8M-l6{Tqvg&y}$x#RRHY$8-`i^KsuWMV&md~4=-SHHLO5u(uYjbAoDchqQh8C zd{jaZ*FbTbk~>PNt0-Y6jOYah?IhyM{Ls2&wJ#bha%a=zQh-WST8q9;;ViHvU&u$@ zs$mmX;|a^vN4bJG8gIfZG;RFuEQaK6KK}9!r%)+AoqXNkNLV1l%yuudt8`P!?Jkm; z3SmFX9$x%lEXb4G{Z|AY95{6;<@OHGoC9$LNL{a9?Bn!)SmIH>nuV%0E;9R-sy3_o zqfKp$Iwg->+2kTMzr*bwnRGmlksS3JpJv=&fy4R*6(9gJQ`vTBkKA~9^YWwWPx%JR2xOq=f|*t7reLJJnYz1(5)l;8xt{J zOYLp!R?QXucJXSh2-LQQ@p$cY?QiauFO$R#as`-4Yn|~5bHG z+#s)Bi|-FGw-m6#+ytjJ9s!P!t4)WSkoa7x{L6hI0boN9oPYVFxMF{L=z`ib_jahf zm-s^Yw{*`FhT~LONIzv!eFe72gNlJuLBr6M-#%!wzIO0Ks|Q5q{xCv&LHZm;q{<*~ zIb?|u)Yj}dw`o{^{_YV?+pB-Y(bS`1NRR|zn56_fc$49Srvdi|u zw3`G$#rds;B&~=WhLP_KLaQ4x_4Cl4aYb@08(JF4;4b*CYxcjcU8;WW-Jt=+{}@5ZgUfgTC<=E9U$)z$P9!)sa#dUCJiF0D9&_k{I zdBw}kI{*%Ao&Dd%F#>_EGOO44{p}*wGYGIU?`JE0iH-Wdfo5iZ&VFkMC-;b~&%0$C zmjCNlx$UUX36r>}bcw0H@^JQ_c4dV`2|6Rg4-%0Z10eW*8DgttVX5@n|*lBM-qNS?h*H!TP>EliIq`EDy&OZ$ai)$L(TjhL5EK%!l{v zZ2zBbMCKv!0WxJQ?`)IhKiiBq%eQ2AGT>xGklS;WqBR}9r!ryh1TJd~X1H_ws`(mn zag?NVITMg#F3PWfUTDRtx2iB&(`8Sud2Y$_Np+CzwMo97WwN;xwXGqV{ znU)YNN_D;vmyT+~1o}|Br9rI^ye}_(8YA4lhW9RaE7FR7lqsX3?_}%pxYZcXm?bjs16Ey2zDz?AAhy z9CobkNd6JillvpNKi=*g!t?QaT=0(neP})hie%Fhq)_TG;a4x_;|c%Zw+s>XzrUux z&l_D7CWk-pW`{nla%zdLP+Yp=U4g$z>TnE=l?;E^P(BA Cu`ROz From 7bf79a3aa3054feb2aec9d076cda71f22da27f58 Mon Sep 17 00:00:00 2001 From: btkcodedev Date: Mon, 17 Nov 2025 09:22:55 +0530 Subject: [PATCH 4/5] fix: move core HBase directory, removed unwanted fields, auth mechanism, update deps --- .../docs/sources/hbase/hbase_pre.md | 14 +- .../docs/sources/hbase/hbase_recipe.yml | 3 +- metadata-ingestion/setup.py | 3 +- .../datahub/ingestion/source/hbase/README.md | 328 ++++++++++ .../ingestion/source/hbase/__init__.py | 3 + .../ingestion/source/{sql => hbase}/hbase.py | 259 +++++--- .../tests/integration/hbase/__init__.py | 1 + .../tests/unit/test_hbase_config.py | 237 +++++++ .../tests/unit/test_hbase_source.py | 602 ++++++++++++++++++ 9 files changed, 1334 insertions(+), 116 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/hbase/README.md create mode 100644 metadata-ingestion/src/datahub/ingestion/source/hbase/__init__.py rename metadata-ingestion/src/datahub/ingestion/source/{sql => hbase}/hbase.py (69%) create mode 100644 metadata-ingestion/tests/integration/hbase/__init__.py create mode 100644 metadata-ingestion/tests/unit/test_hbase_config.py create mode 100644 metadata-ingestion/tests/unit/test_hbase_source.py diff --git a/metadata-ingestion/docs/sources/hbase/hbase_pre.md b/metadata-ingestion/docs/sources/hbase/hbase_pre.md index 4e6d1d111f91bd..0e879d4107e9b1 100644 --- a/metadata-ingestion/docs/sources/hbase/hbase_pre.md +++ b/metadata-ingestion/docs/sources/hbase/hbase_pre.md @@ -1,6 +1,6 @@ ### Setup -This integration extracts metadata from Apache HBase via the Thrift API, including information about: +This integration extracts metadata from Apache HBase via the Thrift API using the `happybase` Python library, including information about: - Namespaces - Tables @@ -17,7 +17,7 @@ You'll need to have HBase Thrift server running and accessible with appropriate pip install 'acryl-datahub[hbase]' ``` - This will install `happybase` and `thrift` packages required for connecting to HBase. + This will install the `happybase` package required for connecting to HBase. 2. 
**HBase Thrift Server**: @@ -42,18 +42,16 @@ You'll need to have HBase Thrift server running and accessible with appropriate #### Authentication -- **No Authentication**: By default, the connector uses no authentication. -- **Kerberos**: Set `auth_mechanism: "KERBEROS"` in the configuration. -- **Custom Authentication**: Specify your authentication mechanism in the `auth_mechanism` field. +The current implementation supports unauthenticated connections to HBase Thrift server. :::note -For production deployments, it's recommended to use secure authentication mechanisms and SSL/TLS connections. +For production deployments, it's recommended to use secure connections and ensure your HBase Thrift server is properly secured with network-level access controls. ::: -:::caution +:::info -The connector samples column qualifiers to extract schema information. For tables with many column qualifiers, adjust the `max_column_qualifiers` parameter to control the sampling size and avoid performance issues. +The connector extracts column family metadata but does not sample individual column qualifiers. This ensures efficient metadata extraction without impacting HBase performance. ::: diff --git a/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml b/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml index 05dc123149094c..ebe55a78d92de4 100644 --- a/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml +++ b/metadata-ingestion/docs/sources/hbase/hbase_recipe.yml @@ -7,7 +7,7 @@ source: # Optional configurations use_ssl: false - auth_mechanism: null # Options: null, KERBEROS, or custom + timeout: 30000 # connection timeout in milliseconds # Filtering patterns namespace_pattern: @@ -23,7 +23,6 @@ source: # Schema extraction options include_column_families: true - max_column_qualifiers: 100 # Environment and platform instance env: "PROD" diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 6876cb07982379..be881a198e2805 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -510,7 +510,6 @@ }, "hbase": { "happybase>=1.2.0", - "thrift>=0.13.0", }, "hive": sql_common | pyhive_common @@ -833,7 +832,7 @@ "glue = datahub.ingestion.source.aws.glue:GlueSource", "sagemaker = datahub.ingestion.source.aws.sagemaker:SagemakerSource", "hana = datahub.ingestion.source.sql.hana:HanaSource", - "hbase = datahub.ingestion.source.sql.hbase:HBaseSource", + "hbase = datahub.ingestion.source.hbase.hbase:HBaseSource", "hive = datahub.ingestion.source.sql.hive:HiveSource", "hive-metastore = datahub.ingestion.source.sql.hive_metastore:HiveMetastoreSource", "json-schema = datahub.ingestion.source.schema.json_schema:JsonSchemaSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/hbase/README.md b/metadata-ingestion/src/datahub/ingestion/source/hbase/README.md new file mode 100644 index 00000000000000..73e2f46376fc4f --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/hbase/README.md @@ -0,0 +1,328 @@ +# HBase Source + +## Overview + +The HBase source connector extracts metadata from Apache HBase clusters through the Thrift API using the `happybase` Python library. 
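
For orientation, the metadata this connector reads maps onto a handful of `happybase` calls. The sketch below is a minimal illustration only, assuming a Thrift server on `localhost:9090`; the host, port, and printed output are examples rather than fixed values. It mirrors how the source lists tables, derives namespaces from `namespace:table` names, and reads column family attributes:

```python
import happybase

# Connect to the HBase Thrift server (framed transport + binary protocol,
# matching the connector's defaults). Timeout is in milliseconds.
connection = happybase.Connection(
    host="localhost",
    port=9090,
    timeout=30000,
    transport="framed",
    protocol="binary",
)

for raw_name in connection.tables():  # table names come back as bytes
    table_name = raw_name.decode("utf-8")
    if ":" in table_name:
        namespace, short_name = table_name.split(":", 1)
    else:
        namespace, short_name = "default", table_name

    # Column family attributes (VERSIONS, COMPRESSION, TTL, ...) per table.
    families = connection.table(raw_name).families()
    family_names = [cf.decode("utf-8").rstrip(":") for cf in families]
    print(f"{namespace}.{short_name}: column families = {family_names}")

connection.close()
```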
+ +## Supported Features + +- **Namespaces**: Discovered as containers +- **Tables**: Extracted with full metadata +- **Column Families**: Schema extraction with column family details +- **Table Properties**: Compression, versions, TTL, and other configurations +- **Filtering**: Namespace and table patterns for selective ingestion +- **Stateful Ingestion**: Automatic detection of deleted entities + +## Installation + +The HBase source requires the `happybase` library: + +```bash +pip install 'acryl-datahub[hbase]' +# or +pip install happybase +``` + +## Prerequisites + +1. **HBase Thrift Server**: Must be running and accessible +2. **Network Access**: Connector must be able to reach the Thrift server (default port: 9090) +3. **Permissions**: Read access to HBase tables and metadata + +### Starting HBase Thrift Server + +If the Thrift server is not already running: + +```bash +# HBase 2.x +hbase thrift start -p 9090 + +# Or as a daemon +hbase-daemon.sh start thrift +``` + +## Configuration + +### Basic Configuration + +```yaml +source: + type: hbase + config: + host: localhost + port: 9090 +``` + +### Full Configuration + +```yaml +source: + type: hbase + config: + # Required + host: hbase.example.com + + # Optional + port: 9090 # Default: 9090 + use_ssl: false # Default: false + timeout: 30000 # Connection timeout in milliseconds, default: 30000 + + # Filtering + namespace_pattern: + allow: + - "prod_.*" + - "staging" + deny: + - ".*_test" + + table_pattern: + allow: + - ".*" + deny: + - ".*_temp" + - ".*_backup" + + # Schema extraction + include_column_families: true # Default: true + + # DataHub configuration + env: PROD # Default: PROD + platform_instance: hbase-cluster-1 # Optional + + # Stateful ingestion + stateful_ingestion: + enabled: true + remove_stale_metadata: true +``` + +## Configuration Options + +| Option | Type | Required | Default | Description | +| ------------------------- | ------- | -------- | --------- | -------------------------------------- | +| `host` | string | ✅ | - | HBase Thrift server hostname or IP | +| `port` | integer | ❌ | 9090 | HBase Thrift server port | +| `use_ssl` | boolean | ❌ | false | Use SSL/TLS for connection | +| `timeout` | integer | ❌ | 30000 | Connection timeout in milliseconds | +| `namespace_pattern` | object | ❌ | allow all | Regex patterns for namespace filtering | +| `table_pattern` | object | ❌ | allow all | Regex patterns for table filtering | +| `include_column_families` | boolean | ❌ | true | Include column families in schema | +| `env` | string | ❌ | PROD | Environment for URN construction | +| `platform_instance` | string | ❌ | None | Platform instance identifier | + +## Extracted Metadata + +### Namespaces + +- Container entities representing HBase namespaces +- Default namespace for tables without explicit namespace +- Namespace descriptions and properties + +### Tables + +- Dataset entities for each HBase table +- Qualified names with namespace prefix (e.g., `namespace:table`) +- Display names and descriptions + +### Schema + +- **Row Key**: Always included as the primary key field +- **Column Families**: Extracted as schema fields when `include_column_families: true` +- Field types and nullability information + +### Properties + +For each column family: + +- `maxVersions`: Maximum number of cell versions +- `compression`: Compression algorithm (NONE, SNAPPY, GZ, LZO, etc.) 
+- `inMemory`: Whether data is kept in memory +- `blockCacheEnabled`: Whether block cache is enabled +- `timeToLive`: TTL setting + +## Examples + +### Example 1: Basic Ingestion + +```yaml +source: + type: hbase + config: + host: localhost + port: 9090 + +sink: + type: datahub-rest + config: + server: http://localhost:8080 +``` + +### Example 2: Production Environment with Filtering + +```yaml +source: + type: hbase + config: + host: hbase-prod.company.com + port: 9090 + use_ssl: true + timeout: 60000 + + namespace_pattern: + allow: + - "prod_.*" + deny: + - "prod_test_.*" + + table_pattern: + deny: + - ".*_backup$" + - ".*_temp$" + + env: PROD + platform_instance: prod-cluster-1 + + stateful_ingestion: + enabled: true + remove_stale_metadata: true + +sink: + type: datahub-rest + config: + server: https://datahub.company.com +``` + +### Example 3: Multiple Clusters + +Ingest from multiple HBase clusters by creating separate recipe files: + +**cluster1-recipe.yml:** + +```yaml +source: + type: hbase + config: + host: hbase-cluster1.company.com + port: 9090 + platform_instance: cluster1 +``` + +**cluster2-recipe.yml:** + +```yaml +source: + type: hbase + config: + host: hbase-cluster2.company.com + port: 9090 + platform_instance: cluster2 +``` + +## Generated URNs + +- **Namespace Container**: `urn:li:container:` (based on namespace name) +- **Table Dataset**: `urn:li:dataset:(urn:li:dataPlatform:hbase,.,)` + +For tables in the default namespace: + +- Name: `table_name` +- Qualified Name: `table_name` + +For tables in named namespaces: + +- Name: `namespace.table_name` +- Qualified Name: `namespace:table_name` + +## Limitations + +1. **Column Qualifiers**: Individual column qualifiers are not extracted (only column families) +2. **Data Sampling**: No data profiling or sampling is performed +3. **Table Relationships**: Foreign key relationships are not extracted +4. **Row Count**: Table row counts are not extracted +5. **Authentication**: Currently supports unauthenticated connections only + +## Troubleshooting + +### Connection Timeout + +**Error**: `Failed to connect to HBase: Connection timed out` + +**Solutions**: + +- Verify HBase Thrift server is running: `netstat -an | grep 9090` +- Check network connectivity: `telnet hbase-host 9090` +- Increase timeout: `timeout: 60000` +- Verify firewall rules allow connection + +### Import Error + +**Error**: `Failed to import happybase library` + +**Solution**: + +```bash +pip install happybase +``` + +### Empty Results + +**Issue**: No tables are ingested + +**Solutions**: + +- Check namespace/table patterns are not too restrictive +- Verify tables exist: `echo "list" | hbase shell` +- Check HBase permissions +- Review ingestion report for dropped entities + +### Schema Not Extracted + +**Issue**: Tables ingested without schema information + +**Solution**: + +- Ensure `include_column_families: true` (default) +- Verify column families exist on tables +- Check for errors in ingestion report + +## Performance Considerations + +1. **Large Clusters**: For clusters with many tables, use filtering to reduce ingestion time +2. **Network Latency**: Higher latency networks may need increased timeout values +3. 
**Concurrent Access**: Multiple ingestions can run in parallel using different platform instances + +## Testing + +### Unit Tests + +```bash +pytest tests/unit/test_hbase_config.py -v +pytest tests/unit/test_hbase_source.py -v +``` + +### Integration Tests + +Requires Docker: + +```bash +cd tests/integration/hbase +docker-compose up -d +pytest test_hbase_integration.py -v +docker-compose down -v +``` + +See `tests/integration/hbase/README.md` for detailed testing instructions. + +## Questions & Support + +For questions or issues: + +1. Check the [DataHub documentation](https://datahubproject.io/) +2. Search [existing issues](https://github.com/datahub-project/datahub/issues) +3. Open a new issue with the `hbase-source` label + +## Related Documentation + +- [HBase Official Documentation](https://hbase.apache.org/) +- [HBase Thrift API](https://hbase.apache.org/book.html#thrift) +- [happybase Documentation](https://happybase.readthedocs.io/) +- [DataHub Sources Overview](https://datahubproject.io/docs/metadata-ingestion/) diff --git a/metadata-ingestion/src/datahub/ingestion/source/hbase/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/hbase/__init__.py new file mode 100644 index 00000000000000..a71f61f90568dc --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/hbase/__init__.py @@ -0,0 +1,3 @@ +from datahub.ingestion.source.hbase.hbase import HBaseSource + +__all__ = ["HBaseSource"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py b/metadata-ingestion/src/datahub/ingestion/source/hbase/hbase.py similarity index 69% rename from metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py rename to metadata-ingestion/src/datahub/ingestion/source/hbase/hbase.py index 5316cfe69a2b23..c0da7c5aa4e203 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hbase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/hbase/hbase.py @@ -5,7 +5,7 @@ import logging from typing import Dict, Iterable, List, Optional, Union -from pydantic import Field +from pydantic import Field, field_validator from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.emitter.mcp_builder import ContainerKey @@ -31,12 +31,12 @@ StatefulIngestionConfigBase, StatefulIngestionSourceBase, ) +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField from datahub.metadata.schema_classes import ( ArrayTypeClass, BooleanTypeClass, BytesTypeClass, NumberTypeClass, - SchemaField, SchemaFieldDataTypeClass, StringTypeClass, ) @@ -59,15 +59,15 @@ class HBaseSourceConfig(StatefulIngestionConfigBase): ) port: int = Field( default=9090, - description="HBase Thrift server port (default: 9090 for Thrift1)", + description="HBase Thrift server port (default: 9090)", ) use_ssl: bool = Field( default=False, description="Whether to use SSL/TLS for connection", ) - auth_mechanism: Optional[str] = Field( - default=None, - description="Authentication mechanism (None, KERBEROS, or custom)", + timeout: Optional[int] = Field( + default=30000, + description="Connection timeout in milliseconds (default: 30000)", ) namespace_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), @@ -81,10 +81,6 @@ class HBaseSourceConfig(StatefulIngestionConfigBase): default=True, description="Include column families as schema metadata", ) - max_column_qualifiers: int = Field( - default=100, - description="Maximum number of column qualifiers to sample per column family", - ) env: str = Field( default="PROD", description="Environment to use in 
namespace when constructing URNs", @@ -94,6 +90,20 @@ class HBaseSourceConfig(StatefulIngestionConfigBase): description="Platform instance to use in namespace when constructing URNs", ) + @field_validator("port") + @classmethod + def validate_port(cls, v: int) -> int: + if not 1 <= v <= 65535: + raise ValueError("Port must be between 1 and 65535") + return v + + @field_validator("timeout") + @classmethod + def validate_timeout(cls, v: Optional[int]) -> Optional[int]: + if v is not None and v <= 0: + raise ValueError("Timeout must be positive") + return v + class HBaseSourceReport(ConfigModel): """ @@ -108,6 +118,10 @@ class HBaseSourceReport(ConfigModel): failures: List[Dict[str, str]] = [] warnings: List[Dict[str, str]] = [] + def close(self) -> None: + """Optional close method for report cleanup""" + pass + def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: if ent_type == "namespace": self.num_namespaces_scanned += 1 @@ -169,7 +183,11 @@ class HBaseSource(StatefulIngestionSourceBase): - Table properties and configuration HBase is a distributed, scalable, big data store built on top of Hadoop. - This connector uses the HBase Thrift API to extract metadata. + This connector uses the HBase Thrift API via the happybase library to extract metadata. + + Requirements: + - HBase Thrift server must be running and accessible + - Install the happybase library: pip install happybase """ config: HBaseSourceConfig @@ -202,29 +220,25 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def _connect(self) -> bool: """ - Establish connection to HBase via Thrift + Establish connection to HBase via happybase (Thrift) """ try: - # Import HBase Thrift libraries - # Note: This requires happybase or similar HBase Python client - from hbase import Hbase - from thrift.protocol import TBinaryProtocol - from thrift.transport import TSocket, TTransport - - # Create socket - transport = TSocket.TSocket(self.config.host, self.config.port) - - # Wrap in buffered transport - transport = TTransport.TBufferedTransport(transport) - - # Use binary protocol - protocol = TBinaryProtocol.TBinaryProtocol(transport) - - # Create client - self.connection = Hbase.Client(protocol) + # Import happybase library + import happybase + + logger.info(f"Connecting to HBase at {self.config.host}:{self.config.port}") + + # Create connection using happybase + self.connection = happybase.Connection( + host=self.config.host, + port=self.config.port, + timeout=self.config.timeout, + transport="framed" if not self.config.use_ssl else "framed", + protocol="binary", + ) - # Open connection - transport.open() + # Test connection by listing tables + _ = self.connection.tables() logger.info( f"Successfully connected to HBase at {self.config.host}:{self.config.port}" @@ -233,14 +247,24 @@ def _connect(self) -> bool: except ImportError: self.report.failure( - message="Failed to import HBase Thrift libraries. Please install 'happybase' or 'hbase-thrift' package.", + message="Failed to import happybase library. Please install it with: pip install happybase", context="connection", ) logger.error( - "HBase Thrift libraries not found. Install with: pip install happybase" + "happybase library not found. 
Install with: pip install happybase" ) return False except Exception as e: + # Clean up connection on failure + if self.connection: + try: + self.connection.close() + logger.debug("Closed failed connection attempt") + except Exception as close_error: + logger.debug(f"Error closing failed connection: {close_error}") + finally: + self.connection = None + self.report.failure( message="Failed to connect to HBase", context=f"{self.config.host}:{self.config.port}", @@ -248,22 +272,33 @@ def _connect(self) -> bool: ) return False + def _close_connection(self) -> None: + """ + Internal method to safely close the HBase connection + """ + if self.connection: + try: + self.connection.close() + logger.info("HBase connection closed successfully") + except Exception as e: + logger.warning(f"Error closing HBase connection: {e}") + finally: + self.connection = None + def _get_namespaces(self) -> List[str]: """ Get list of namespaces from HBase """ try: - # HBase Thrift1 doesn't have direct namespace support - # We'll get all tables and extract namespaces from table names - # Table names in HBase can be namespace:table or just table (default namespace) - tables = self.connection.getTableNames() + # Get all tables including their namespaces + tables = self.connection.tables() namespaces = set() for table in tables: table_str = ( table.decode("utf-8") if isinstance(table, bytes) else str(table) ) - if ":" in table_str: + if b":" in table or ":" in table_str: namespace = table_str.split(":", 1)[0] namespaces.add(namespace) else: @@ -280,7 +315,7 @@ def _get_tables_in_namespace(self, namespace: str) -> List[str]: Get all tables in a given namespace """ try: - all_tables = self.connection.getTableNames() + all_tables = self.connection.tables() namespace_tables = [] for table in all_tables: @@ -309,23 +344,26 @@ def _get_tables_in_namespace(self, namespace: str) -> List[str]: def _get_table_descriptor(self, full_table_name: str) -> Optional[Dict]: """ - Get table descriptor including column families + Get table descriptor including column families using happybase """ try: - # Convert to bytes if string - table_bytes = ( + # Convert to bytes if string (happybase expects bytes for table names) + table_name_bytes = ( full_table_name.encode("utf-8") if isinstance(full_table_name, str) else full_table_name ) - # Get column descriptors - descriptors = self.connection.getColumnDescriptors(table_bytes) + # Get table object + table = self.connection.table(table_name_bytes) + + # Get column families from table + families = table.families() # Convert to dict structure result = {"column_families": {}} - for cf_name, cf_descriptor in descriptors.items(): + for cf_name, cf_descriptor in families.items(): cf_name_str = ( cf_name.decode("utf-8") if isinstance(cf_name, bytes) @@ -336,13 +374,29 @@ def _get_table_descriptor(self, full_table_name: str) -> Optional[Dict]: result["column_families"][cf_name_str] = { "name": cf_name_str, - "maxVersions": getattr(cf_descriptor, "maxVersions", 1), - "compression": getattr(cf_descriptor, "compression", "NONE"), - "inMemory": getattr(cf_descriptor, "inMemory", False), - "blockCacheEnabled": getattr( - cf_descriptor, "blockCacheEnabled", True - ), - "timeToLive": getattr(cf_descriptor, "timeToLive", -1), + "maxVersions": cf_descriptor.get(b"VERSIONS", b"1").decode("utf-8") + if isinstance(cf_descriptor.get(b"VERSIONS"), bytes) + else str(cf_descriptor.get("VERSIONS", "1")), + "compression": cf_descriptor.get(b"COMPRESSION", b"NONE").decode( + "utf-8" + ) + if 
isinstance(cf_descriptor.get(b"COMPRESSION"), bytes) + else str(cf_descriptor.get("COMPRESSION", "NONE")), + "inMemory": cf_descriptor.get(b"IN_MEMORY", b"false").decode( + "utf-8" + ) + == "true" + if isinstance(cf_descriptor.get(b"IN_MEMORY"), bytes) + else cf_descriptor.get("IN_MEMORY", "false") == "true", + "blockCacheEnabled": cf_descriptor.get( + b"BLOCKCACHE", b"true" + ).decode("utf-8") + == "true" + if isinstance(cf_descriptor.get(b"BLOCKCACHE"), bytes) + else cf_descriptor.get("BLOCKCACHE", "true") == "true", + "timeToLive": cf_descriptor.get(b"TTL", b"FOREVER").decode("utf-8") + if isinstance(cf_descriptor.get(b"TTL"), bytes) + else str(cf_descriptor.get("TTL", "FOREVER")), } return result @@ -499,49 +553,53 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, Entity]]: if not self._connect(): return - # Get all namespaces - namespaces = self._get_namespaces() - - for namespace in namespaces: - # Check if namespace matches pattern - if not self.config.namespace_pattern.allowed(namespace): - self.report.report_dropped(namespace) - continue - - self.report.report_entity_scanned(namespace, ent_type="namespace") - - # Generate namespace container - yield self._generate_namespace_container(namespace) - - # Get tables in namespace - tables = self._get_tables_in_namespace(namespace) - - for table_name in tables: - try: - # Get full table name for HBase API - if namespace == "default": - full_table_name = table_name - else: - full_table_name = f"{namespace}:{table_name}" - - # Get table descriptor - table_descriptor = self._get_table_descriptor(full_table_name) - - # Generate table dataset - dataset = self._generate_table_dataset( - namespace, table_name, table_descriptor - ) - - if dataset: - yield dataset - - except Exception as e: - self.report.num_tables_failed += 1 - self.report.failure( - message="Failed to process table", - context=f"{namespace}:{table_name}", - exc=e, - ) + try: + # Get all namespaces + namespaces = self._get_namespaces() + + for namespace in namespaces: + # Check if namespace matches pattern + if not self.config.namespace_pattern.allowed(namespace): + self.report.report_dropped(namespace) + continue + + self.report.report_entity_scanned(namespace, ent_type="namespace") + + # Generate namespace container + yield self._generate_namespace_container(namespace) + + # Get tables in namespace + tables = self._get_tables_in_namespace(namespace) + + for table_name in tables: + try: + # Get full table name for HBase API + if namespace == "default": + full_table_name = table_name + else: + full_table_name = f"{namespace}:{table_name}" + + # Get table descriptor + table_descriptor = self._get_table_descriptor(full_table_name) + + # Generate table dataset + dataset = self._generate_table_dataset( + namespace, table_name, table_descriptor + ) + + if dataset: + yield dataset + + except Exception as e: + self.report.num_tables_failed += 1 + self.report.failure( + message="Failed to process table", + context=f"{namespace}:{table_name}", + exc=e, + ) + finally: + # Always close connection after processing, even if errors occurred + self._close_connection() def get_report(self) -> HBaseSourceReport: """ @@ -551,14 +609,7 @@ def get_report(self) -> HBaseSourceReport: def close(self) -> None: """ - Clean up resources + Clean up resources and close HBase connection """ - if self.connection: - try: - # Close connection if it has a close method - if hasattr(self.connection, "close"): - self.connection.close() - except Exception as e: - logger.warning(f"Error closing 
HBase connection: {e}") - + self._close_connection() super().close() diff --git a/metadata-ingestion/tests/integration/hbase/__init__.py b/metadata-ingestion/tests/integration/hbase/__init__.py new file mode 100644 index 00000000000000..f8eff3281a1889 --- /dev/null +++ b/metadata-ingestion/tests/integration/hbase/__init__.py @@ -0,0 +1 @@ +"""Integration tests package for HBase source""" diff --git a/metadata-ingestion/tests/unit/test_hbase_config.py b/metadata-ingestion/tests/unit/test_hbase_config.py new file mode 100644 index 00000000000000..e0cd2bed1fad58 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_hbase_config.py @@ -0,0 +1,237 @@ +""" +Unit tests for HBase source configuration validation +""" + +from typing import Any, Dict + +import pytest + +from datahub.configuration.common import AllowDenyPattern +from datahub.ingestion.source.hbase.hbase import HBaseSourceConfig + + +def _base_config() -> Dict[str, Any]: + """Base configuration for HBase tests.""" + return { + "host": "localhost", + "port": 9090, + } + + +class TestHBaseConfig: + """Test configuration validation and initialization.""" + + def test_valid_config(self): + """Test that valid configuration is accepted.""" + config_dict = _base_config() + config = HBaseSourceConfig.model_validate(config_dict) + + assert config.host == "localhost" + assert config.port == 9090 + assert config.use_ssl is False + assert config.timeout == 30000 + assert config.include_column_families is True + assert config.env == "PROD" + assert config.platform_instance is None + + def test_custom_port(self): + """Test custom port configuration.""" + config_dict = { + **_base_config(), + "port": 9095, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.port == 9095 + + def test_invalid_port_too_low(self): + """Test that port below 1 is rejected.""" + config_dict = { + **_base_config(), + "port": 0, + } + with pytest.raises(ValueError, match="Port must be between 1 and 65535"): + HBaseSourceConfig.model_validate(config_dict) + + def test_invalid_port_too_high(self): + """Test that port above 65535 is rejected.""" + config_dict = { + **_base_config(), + "port": 65536, + } + with pytest.raises(ValueError, match="Port must be between 1 and 65535"): + HBaseSourceConfig.model_validate(config_dict) + + def test_valid_port_edge_cases(self): + """Test valid port edge cases (1 and 65535).""" + # Test minimum valid port + config_dict = { + **_base_config(), + "port": 1, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.port == 1 + + # Test maximum valid port + config_dict = { + **_base_config(), + "port": 65535, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.port == 65535 + + def test_ssl_configuration(self): + """Test SSL configuration.""" + config_dict = { + **_base_config(), + "use_ssl": True, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.use_ssl is True + + def test_timeout_configuration(self): + """Test timeout configuration.""" + config_dict = { + **_base_config(), + "timeout": 60000, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.timeout == 60000 + + def test_invalid_timeout_negative(self): + """Test that negative timeout is rejected.""" + config_dict = { + **_base_config(), + "timeout": -1, + } + with pytest.raises(ValueError, match="Timeout must be positive"): + HBaseSourceConfig.model_validate(config_dict) + + def test_invalid_timeout_zero(self): + """Test that zero timeout is rejected.""" + config_dict = 
{ + **_base_config(), + "timeout": 0, + } + with pytest.raises(ValueError, match="Timeout must be positive"): + HBaseSourceConfig.model_validate(config_dict) + + def test_namespace_pattern_default(self): + """Test namespace pattern defaults to allow all.""" + config_dict = _base_config() + config = HBaseSourceConfig.model_validate(config_dict) + assert isinstance(config.namespace_pattern, AllowDenyPattern) + assert config.namespace_pattern.allowed("any_namespace") + + def test_namespace_pattern_custom(self): + """Test custom namespace pattern.""" + config_dict = { + **_base_config(), + "namespace_pattern": { + "allow": ["prod_.*"], + "deny": ["prod_test"], + }, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.namespace_pattern.allowed("prod_main") + assert not config.namespace_pattern.allowed("prod_test") + assert not config.namespace_pattern.allowed("dev_main") + + def test_table_pattern_default(self): + """Test table pattern defaults to allow all.""" + config_dict = _base_config() + config = HBaseSourceConfig.model_validate(config_dict) + assert isinstance(config.table_pattern, AllowDenyPattern) + assert config.table_pattern.allowed("any_table") + + def test_table_pattern_custom(self): + """Test custom table pattern.""" + config_dict = { + **_base_config(), + "table_pattern": { + "allow": ["users_.*", "products_.*"], + "deny": [".*_temp"], + }, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.table_pattern.allowed("users_main") + assert config.table_pattern.allowed("products_catalog") + assert not config.table_pattern.allowed("users_temp") + assert not config.table_pattern.allowed("orders_main") + + def test_include_column_families_default(self): + """Test include_column_families defaults to True.""" + config_dict = _base_config() + config = HBaseSourceConfig.model_validate(config_dict) + assert config.include_column_families is True + + def test_include_column_families_false(self): + """Test include_column_families can be set to False.""" + config_dict = { + **_base_config(), + "include_column_families": False, + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.include_column_families is False + + def test_env_default(self): + """Test env defaults to PROD.""" + config_dict = _base_config() + config = HBaseSourceConfig.model_validate(config_dict) + assert config.env == "PROD" + + def test_env_custom(self): + """Test custom env value.""" + config_dict = { + **_base_config(), + "env": "DEV", + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.env == "DEV" + + def test_platform_instance_default(self): + """Test platform_instance defaults to None.""" + config_dict = _base_config() + config = HBaseSourceConfig.model_validate(config_dict) + assert config.platform_instance is None + + def test_platform_instance_custom(self): + """Test custom platform_instance value.""" + config_dict = { + **_base_config(), + "platform_instance": "hbase-cluster-1", + } + config = HBaseSourceConfig.model_validate(config_dict) + assert config.platform_instance == "hbase-cluster-1" + + def test_full_configuration(self): + """Test full configuration with all options.""" + config_dict = { + "host": "hbase.example.com", + "port": 9095, + "use_ssl": True, + "timeout": 45000, + "namespace_pattern": { + "allow": ["prod_.*"], + "deny": ["prod_test"], + }, + "table_pattern": { + "allow": [".*"], + "deny": [".*_backup"], + }, + "include_column_families": True, + "env": "PROD", + "platform_instance": 
"hbase-prod-cluster", + } + config = HBaseSourceConfig.model_validate(config_dict) + + assert config.host == "hbase.example.com" + assert config.port == 9095 + assert config.use_ssl is True + assert config.timeout == 45000 + assert config.namespace_pattern.allowed("prod_main") + assert not config.namespace_pattern.allowed("prod_test") + assert config.table_pattern.allowed("users") + assert not config.table_pattern.allowed("users_backup") + assert config.include_column_families is True + assert config.env == "PROD" + assert config.platform_instance == "hbase-prod-cluster" diff --git a/metadata-ingestion/tests/unit/test_hbase_source.py b/metadata-ingestion/tests/unit/test_hbase_source.py new file mode 100644 index 00000000000000..2613366a3d0556 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_hbase_source.py @@ -0,0 +1,602 @@ +""" +Unit tests for HBase source logic with mocked HBase connections +""" + +from typing import Any, Dict +from unittest.mock import MagicMock, patch + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.hbase.hbase import ( + HBaseSource, + HBaseSourceConfig, + HBaseSourceReport, +) +from datahub.metadata.schema_classes import ( + BytesTypeClass, + SchemaFieldDataTypeClass, +) +from datahub.sdk.container import Container +from datahub.sdk.dataset import Dataset + + +def _base_config() -> Dict[str, Any]: + """Base configuration for HBase tests.""" + return { + "host": "localhost", + "port": 9090, + } + + +class TestHBaseSource: + """Test HBase source logic with mocked connections.""" + + def test_source_initialization(self): + """Test that source initializes correctly.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + assert source.platform == "hbase" + assert source.config.host == "localhost" + assert source.config.port == 9090 + assert isinstance(source.report, HBaseSourceReport) + assert source.connection is None + + def test_get_platform(self): + """Test get_platform returns correct platform name.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + assert source.get_platform() == "hbase" + + @patch("happybase.Connection") + def test_connect_success(self, mock_connection_class): + """Test successful connection to HBase.""" + # Setup mock + mock_connection = MagicMock() + mock_connection.tables.return_value = [b"test_table"] + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + # Test connection + result = source._connect() + + assert result is True + assert source.connection is not None + mock_connection_class.assert_called_once_with( + host="localhost", + port=9090, + timeout=30000, + transport="framed", + protocol="binary", + ) + + @patch("happybase.Connection") + def test_connect_import_error(self, mock_connection_class): + """Test connection fails gracefully when happybase is not installed.""" + mock_connection_class.side_effect = ImportError("No module named 'happybase'") + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + result = source._connect() + + assert result is False + assert len(source.report.failures) == 1 + assert "happybase" in source.report.failures[0]["message"] + + 
@patch("happybase.Connection") + def test_connect_connection_error(self, mock_connection_class): + """Test connection fails gracefully on connection error.""" + mock_connection_class.side_effect = Exception("Connection refused") + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + result = source._connect() + + assert result is False + assert len(source.report.failures) == 1 + assert "Failed to connect to HBase" in source.report.failures[0]["message"] + + @patch("happybase.Connection") + def test_get_namespaces_with_default(self, mock_connection_class): + """Test getting namespaces including default namespace.""" + # Setup mock + mock_connection = MagicMock() + mock_connection.tables.return_value = [ + b"table1", # default namespace + b"table2", # default namespace + b"prod:users", # prod namespace + b"prod:orders", # prod namespace + b"dev:test", # dev namespace + ] + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + namespaces = source._get_namespaces() + + assert len(namespaces) == 3 + assert "default" in namespaces + assert "dev" in namespaces + assert "prod" in namespaces + assert namespaces == ["default", "dev", "prod"] # Should be sorted + + @patch("happybase.Connection") + def test_get_namespaces_error(self, mock_connection_class): + """Test getting namespaces handles errors gracefully.""" + # Setup mock + mock_connection = MagicMock() + mock_connection.tables.side_effect = Exception("Table list error") + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + namespaces = source._get_namespaces() + + assert namespaces == [] + assert len(source.report.failures) > 0 + + @patch("happybase.Connection") + def test_get_tables_in_default_namespace(self, mock_connection_class): + """Test getting tables in default namespace.""" + # Setup mock + mock_connection = MagicMock() + mock_connection.tables.return_value = [ + b"table1", + b"table2", + b"prod:users", + ] + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + tables = source._get_tables_in_namespace("default") + + assert len(tables) == 2 + assert "table1" in tables + assert "table2" in tables + + @patch("happybase.Connection") + def test_get_tables_in_named_namespace(self, mock_connection_class): + """Test getting tables in a named namespace.""" + # Setup mock + mock_connection = MagicMock() + mock_connection.tables.return_value = [ + b"table1", + b"prod:users", + b"prod:orders", + b"dev:test", + ] + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + tables = source._get_tables_in_namespace("prod") + + assert len(tables) == 2 + assert "users" in tables + assert "orders" in tables + + @patch("happybase.Connection") + def test_get_table_descriptor(self, mock_connection_class): + """Test getting table descriptor with column families.""" + # Setup mock + mock_connection = MagicMock() + mock_table = 
MagicMock() + mock_table.families.return_value = { + b"cf1": { + b"VERSIONS": b"3", + b"COMPRESSION": b"SNAPPY", + b"IN_MEMORY": b"false", + b"BLOCKCACHE": b"true", + b"TTL": b"86400", + }, + b"cf2:": { # Test with trailing colon + b"VERSIONS": b"1", + b"COMPRESSION": b"NONE", + b"IN_MEMORY": b"true", + b"BLOCKCACHE": b"true", + b"TTL": b"FOREVER", + }, + } + mock_connection.table.return_value = mock_table + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + descriptor = source._get_table_descriptor("test_table") + + assert descriptor is not None + assert "column_families" in descriptor + assert len(descriptor["column_families"]) == 2 + assert "cf1" in descriptor["column_families"] + assert "cf2" in descriptor["column_families"] # Colon should be stripped + + cf1 = descriptor["column_families"]["cf1"] + assert cf1["maxVersions"] == "3" + assert cf1["compression"] == "SNAPPY" + assert cf1["inMemory"] is False + assert cf1["blockCacheEnabled"] is True + assert cf1["timeToLive"] == "86400" + + @patch("happybase.Connection") + def test_get_table_descriptor_error(self, mock_connection_class): + """Test getting table descriptor handles errors gracefully.""" + # Setup mock + mock_connection = MagicMock() + mock_connection.table.side_effect = Exception("Table not found") + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + descriptor = source._get_table_descriptor("nonexistent_table") + + assert descriptor is None + assert len(source.report.failures) > 0 + + def test_generate_schema_fields(self): + """Test schema field generation from table descriptor.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + table_descriptor = { + "column_families": { + "cf1": {"name": "cf1"}, + "cf2": {"name": "cf2"}, + } + } + + schema_fields = source._generate_schema_fields(table_descriptor) + + assert len(schema_fields) == 3 # rowkey + 2 column families + assert schema_fields[0].fieldPath == "rowkey" + assert schema_fields[0].isPartOfKey is True + assert schema_fields[0].nullable is False + + assert schema_fields[1].fieldPath == "cf1" + assert schema_fields[1].description == "Column family: cf1" + assert schema_fields[1].nullable is True + + assert schema_fields[2].fieldPath == "cf2" + assert schema_fields[2].description == "Column family: cf2" + + def test_convert_hbase_type_to_schema_field_type(self): + """Test HBase type conversion to schema field types.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + # Test bytes type (default) + schema_type = source._convert_hbase_type_to_schema_field_type("bytes") + assert isinstance(schema_type, SchemaFieldDataTypeClass) + assert isinstance(schema_type.type, BytesTypeClass) + + # Test default when unknown type + schema_type = source._convert_hbase_type_to_schema_field_type("unknown") + assert isinstance(schema_type.type, BytesTypeClass) + + def test_generate_namespace_container(self): + """Test namespace container generation.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, 
config) + + container = source._generate_namespace_container("prod") + + assert isinstance(container, Container) + assert container.display_name == "prod" + assert container.qualified_name == "prod" + assert "HBase namespace: prod" in container.description + + @patch("happybase.Connection") + def test_generate_table_dataset(self, mock_connection_class): + """Test table dataset generation.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + table_descriptor = { + "column_families": { + "cf1": { + "maxVersions": "3", + "compression": "SNAPPY", + }, + } + } + + dataset = source._generate_table_dataset("prod", "users", table_descriptor) + + assert isinstance(dataset, Dataset) + assert str(dataset.platform) == "urn:li:dataPlatform:hbase" + # Dataset URN contains the name + assert "prod.users" in str(dataset.urn) + assert dataset.display_name == "users" + assert dataset.qualified_name == "prod:users" + assert "HBase table in namespace 'prod'" in dataset.description + assert dataset.custom_properties is not None + assert dataset.custom_properties["column_families"] == "1" + assert dataset.custom_properties["cf.cf1.maxVersions"] == "3" + assert dataset.custom_properties["cf.cf1.compression"] == "SNAPPY" + + @patch("happybase.Connection") + def test_generate_table_dataset_default_namespace(self, mock_connection_class): + """Test table dataset generation for default namespace.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + table_descriptor = {"column_families": {}} + + dataset = source._generate_table_dataset("default", "table1", table_descriptor) + + # Dataset URN contains the name + assert "table1" in str(dataset.urn) + assert dataset.qualified_name == "table1" + + @patch("happybase.Connection") + def test_generate_table_dataset_filtered(self, mock_connection_class): + """Test table dataset generation with filtering.""" + config_dict = { + **_base_config(), + "table_pattern": { + "allow": ["prod.*"], + "deny": ["prod.test"], + }, + } + config = HBaseSourceConfig.model_validate(config_dict) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + table_descriptor = {"column_families": {}} + + # Should be allowed + dataset = source._generate_table_dataset("prod", "users", table_descriptor) + assert dataset is not None + + # Should be denied + dataset = source._generate_table_dataset("prod", "test", table_descriptor) + assert dataset is None + assert len(source.report.dropped_tables) > 0 + + @patch("happybase.Connection") + def test_generate_table_dataset_without_column_families( + self, mock_connection_class + ): + """Test table dataset generation with include_column_families=False.""" + config_dict = { + **_base_config(), + "include_column_families": False, + } + config = HBaseSourceConfig.model_validate(config_dict) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + table_descriptor = { + "column_families": { + "cf1": {"maxVersions": "3"}, + } + } + + dataset = source._generate_table_dataset("default", "table1", table_descriptor) + + # schema should be None when include_column_families=False + # Can't check dataset.schema directly as it raises error if not set + # Instead check the dataset was created + assert dataset is not None + assert "table1" in str(dataset.urn) + + @patch("happybase.Connection") + def test_get_workunits_internal_success(self, 
mock_connection_class): + """Test successful work unit generation.""" + # Setup mocks + mock_connection = MagicMock() + mock_connection.tables.return_value = [ + b"table1", + b"prod:users", + ] + + mock_table = MagicMock() + mock_table.families.return_value = { + b"cf1": { + b"VERSIONS": b"1", + b"COMPRESSION": b"NONE", + b"IN_MEMORY": b"false", + b"BLOCKCACHE": b"true", + b"TTL": b"FOREVER", + }, + } + mock_connection.table.return_value = mock_table + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + # Get workunits + workunits = list(source.get_workunits_internal()) + + # Should have 2 containers (default, prod) + 2 datasets + assert len(workunits) == 4 + assert source.report.num_namespaces_scanned == 2 + assert source.report.num_tables_scanned == 2 + + @patch("happybase.Connection") + def test_get_workunits_internal_with_namespace_filter(self, mock_connection_class): + """Test work unit generation with namespace filtering.""" + # Setup mocks + mock_connection = MagicMock() + mock_connection.tables.return_value = [ + b"table1", + b"prod:users", + b"dev:test", + ] + + mock_table = MagicMock() + mock_table.families.return_value = { + b"cf1": { + b"VERSIONS": b"1", + b"COMPRESSION": b"NONE", + b"IN_MEMORY": b"false", + b"BLOCKCACHE": b"true", + b"TTL": b"FOREVER", + }, + } + mock_connection.table.return_value = mock_table + mock_connection_class.return_value = mock_connection + + config_dict = { + **_base_config(), + "namespace_pattern": { + "allow": ["prod"], + }, + } + config = HBaseSourceConfig.model_validate(config_dict) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + # Get workunits + workunits = list(source.get_workunits_internal()) + + # Should only have prod namespace (1 container + 1 dataset) + assert len(workunits) == 2 + assert source.report.num_namespaces_scanned == 1 + assert len(source.report.dropped_namespaces) == 2 # default and dev + + @patch("happybase.Connection") + def test_get_workunits_internal_connection_failure(self, mock_connection_class): + """Test work unit generation when connection fails.""" + mock_connection_class.side_effect = Exception("Connection failed") + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + # Get workunits + workunits = list(source.get_workunits_internal()) + + assert len(workunits) == 0 + assert len(source.report.failures) > 0 + + @patch("happybase.Connection") + def test_close_with_connection(self, mock_connection_class): + """Test closing source with active connection.""" + mock_connection = MagicMock() + mock_connection.tables.return_value = [b"test"] + mock_connection_class.return_value = mock_connection + + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + source._connect() + + # Close source + source.close() + + # Verify connection close was called + mock_connection.close.assert_called_once() + + def test_close_without_connection(self): + """Test closing source without active connection.""" + config = HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + # Should not raise error + source.close() + + def test_get_report(self): + """Test getting ingestion report.""" + config = 
HBaseSourceConfig.model_validate(_base_config()) + ctx = PipelineContext(run_id="test-run") + source = HBaseSource(ctx, config) + + report = source.get_report() + + assert isinstance(report, HBaseSourceReport) + assert report.num_namespaces_scanned == 0 + assert report.num_tables_scanned == 0 + assert report.num_tables_failed == 0 + + +class TestHBaseSourceReport: + """Test HBase source report functionality.""" + + def test_report_entity_scanned_namespace(self): + """Test reporting scanned namespace.""" + report = HBaseSourceReport() + report.report_entity_scanned("prod", ent_type="namespace") + + assert report.num_namespaces_scanned == 1 + assert report.num_tables_scanned == 0 + + def test_report_entity_scanned_table(self): + """Test reporting scanned table.""" + report = HBaseSourceReport() + report.report_entity_scanned("prod.users", ent_type="table") + + assert report.num_namespaces_scanned == 0 + assert report.num_tables_scanned == 1 + + def test_report_dropped_namespace(self): + """Test reporting dropped namespace.""" + report = HBaseSourceReport() + report.report_dropped("dev") + + assert len(report.dropped_namespaces) == 1 + assert "dev" in report.dropped_namespaces + + def test_report_dropped_table(self): + """Test reporting dropped table.""" + report = HBaseSourceReport() + report.report_dropped("prod.users") + + assert len(report.dropped_tables) == 1 + assert "prod.users" in report.dropped_tables + + def test_failure_reporting(self): + """Test failure reporting.""" + report = HBaseSourceReport() + report.failure( + message="Test failure", context="test_context", exc=Exception("Test error") + ) + + assert len(report.failures) == 1 + assert report.failures[0]["message"] == "Test failure" + assert report.failures[0]["context"] == "test_context" + assert "Test error" in report.failures[0]["exception"] + + def test_warning_reporting(self): + """Test warning reporting.""" + report = HBaseSourceReport() + report.warning(message="Test warning", context="test_context") + + assert len(report.warnings) == 1 + assert report.warnings[0]["message"] == "Test warning" + assert report.warnings[0]["context"] == "test_context" From e0b79f17c812228d2908ed9a05226ec29b6c77a1 Mon Sep 17 00:00:00 2001 From: btkcodedev Date: Wed, 19 Nov 2025 10:23:20 +0530 Subject: [PATCH 5/5] fix: integration test with mock --- .../tests/integration/hbase/README.md | 82 ++++++ .../hbase/test_hbase_integration.py | 239 ++++++++++++++++++ 2 files changed, 321 insertions(+) create mode 100644 metadata-ingestion/tests/integration/hbase/README.md create mode 100644 metadata-ingestion/tests/integration/hbase/test_hbase_integration.py diff --git a/metadata-ingestion/tests/integration/hbase/README.md b/metadata-ingestion/tests/integration/hbase/README.md new file mode 100644 index 00000000000000..fdb1e27eb2ea5f --- /dev/null +++ b/metadata-ingestion/tests/integration/hbase/README.md @@ -0,0 +1,82 @@ +# HBase Integration Tests + +This directory contains integration tests for the HBase source connector. + +**Note**: These tests use **mocked HBase connections** (no real HBase infrastructure required). They test the integration logic and data flow, while unit tests in `tests/unit/test_hbase_*.py` validate individual components. + +## Overview + +The integration tests use `unittest.mock` to simulate HBase connections via the `happybase` library. 
This approach: + +- Runs quickly (~0.2 seconds for all 9 tests) +- Requires no external infrastructure or Docker +- Tests the integration logic between DataHub SDK and HBase source +- Validates end-to-end data flow with mocked data + +For testing against a real HBase cluster, manual verification is required. + +## Prerequisites + +- Python packages: `pytest` +- No HBase installation or Docker required + +## Running Tests + +```bash +# All integration tests (runs in ~0.2 seconds) +PYTHONPATH=src pytest tests/integration/hbase/ -v + +# Specific test +PYTHONPATH=src pytest tests/integration/hbase/test_hbase_integration.py::TestHBaseIntegration::test_connection_to_hbase -v + +# With verbose output +PYTHONPATH=src pytest tests/integration/hbase/ -v -s +``` + +## Test Coverage + +All tests use mocked HBase connections and validate: + +1. **test_connection_to_hbase** - Connection initialization with correct parameters +2. **test_get_namespaces** - Namespace extraction from table names +3. **test_get_tables_in_default_namespace** - Table discovery in default namespace +4. **test_get_table_descriptor** - Column family metadata extraction +5. **test_full_ingestion** - Complete ingestion workflow with containers and datasets +6. **test_ingestion_with_namespace_filter** - Namespace pattern filtering +7. **test_ingestion_with_table_filter** - Table pattern filtering +8. **test_schema_extraction** - Schema generation from column families +9. **test_custom_properties** - Custom property population + +## Test Data + +The mock fixture provides: + +- **Tables:** test_table1 (cf1, cf2), test_table2 (info, data) +- **Namespace:** default (extracted from table names) +- **Column Families:** Various configurations with MAX_VERSIONS properties + +## Testing Against Real HBase + +For testing against a real HBase cluster: + +1. Set up HBase with Thrift server enabled on port 9090 +2. Manually test connection: + +```bash +python -c "import happybase; conn = happybase.Connection('localhost', 9090); print(conn.tables()); conn.close()" +``` + +3. Run the source with a test config: + +```bash +datahub ingest -c your_hbase_config.yml +``` + +## Architecture + +The tests follow DataHub's integration test patterns (similar to Fivetran, etc.): + +- Use `@pytest.fixture` with `mock.patch` to mock external dependencies +- Test integration logic without requiring real infrastructure +- Unit tests (50 tests in `tests/unit/`) provide component-level validation +- Integration tests (9 tests here) validate end-to-end data flow diff --git a/metadata-ingestion/tests/integration/hbase/test_hbase_integration.py b/metadata-ingestion/tests/integration/hbase/test_hbase_integration.py new file mode 100644 index 00000000000000..aa8d6f972850a8 --- /dev/null +++ b/metadata-ingestion/tests/integration/hbase/test_hbase_integration.py @@ -0,0 +1,239 @@ +""" +Integration tests for HBase source + +NOTE: These tests use mocked HBase connections (no real HBase required). +They test the integration logic and data flow, while unit tests validate individual components. +Real HBase testing requires manual verification with an actual HBase cluster. 
+""" + +from typing import List +from unittest import mock + +import pytest + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.hbase.hbase import HBaseSource, HBaseSourceConfig +from datahub.sdk.container import Container +from datahub.sdk.dataset import Dataset +from datahub.sdk.entity import Entity + +# Mark all tests in this module as integration tests +pytestmark = pytest.mark.integration + + +@pytest.fixture +def mock_hbase_connection(): + """Mock happybase connection for testing without real HBase.""" + with mock.patch("happybase.Connection") as mock_conn: + # Mock connection instance + conn_instance = mock.MagicMock() + + # Mock tables() to return test tables + conn_instance.tables.return_value = [ + b"test_table1", + b"test_table2", + ] + + # Mock table() to return table instance with families() + def mock_table(name): + table_mock = mock.MagicMock() + if name == b"test_table1": + table_mock.families.return_value = { + b"cf1": {b"MAX_VERSIONS": b"3"}, + b"cf2": {b"MAX_VERSIONS": b"1"}, + } + elif name == b"test_table2": + table_mock.families.return_value = { + b"info": {b"MAX_VERSIONS": b"1"}, + b"data": {b"MAX_VERSIONS": b"1"}, + } + return table_mock + + conn_instance.table.side_effect = mock_table + mock_conn.return_value = conn_instance + + yield mock_conn + + +class TestHBaseIntegration: + """Integration tests for HBase source with mocked connections.""" + + def test_connection_to_hbase(self, mock_hbase_connection): + """Test that we can connect to HBase.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + result = source._connect() + assert result is True + assert source.connection is not None + + # Verify connection was called with correct params + mock_hbase_connection.assert_called_once() + call_kwargs = mock_hbase_connection.call_args[1] + assert call_kwargs["host"] == "localhost" + assert call_kwargs["port"] == 9090 + + source.close() + + def test_get_namespaces(self, mock_hbase_connection): + """Test getting namespaces from HBase.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + source._connect() + namespaces = source._get_namespaces() + + # Should extract default namespace from table names + assert "default" in namespaces + assert len(namespaces) >= 1 + + source.close() + + def test_get_tables_in_default_namespace(self, mock_hbase_connection): + """Test getting tables in default namespace.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + source._connect() + tables = source._get_tables_in_namespace("default") + + assert "test_table1" in tables + assert "test_table2" in tables + assert len(tables) >= 2 + + source.close() + + def test_get_table_descriptor(self, mock_hbase_connection): + """Test getting table descriptor with column families.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + source._connect() + descriptor = source._get_table_descriptor("test_table1") + + assert descriptor is not None + assert "column_families" in descriptor + assert "cf1" in descriptor["column_families"] + assert "cf2" in descriptor["column_families"] + + source.close() + + def test_full_ingestion(self, mock_hbase_connection): + """Test full ingestion 
process.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + workunits: List[Entity] = list(source.get_workunits_internal()) + + # Should have containers for namespaces and datasets for tables + containers = [wu for wu in workunits if isinstance(wu, Container)] + datasets = [wu for wu in workunits if isinstance(wu, Dataset)] + + # At least default namespace container + assert len(containers) >= 1 + + # At least our 2 test tables + assert len(datasets) >= 2 + + # Check that default namespace is present + namespace_names = {c.display_name for c in containers} + assert "default" in namespace_names + + # Check that test tables are present + table_names = {d.display_name for d in datasets} + assert "test_table1" in table_names + assert "test_table2" in table_names + + # Check report + report = source.get_report() + assert report.num_namespaces_scanned >= 1 + assert report.num_tables_scanned >= 2 + assert report.num_tables_failed == 0 + + source.close() + + def test_ingestion_with_namespace_filter(self, mock_hbase_connection): + """Test ingestion with namespace filtering.""" + config = HBaseSourceConfig( + host="localhost", + port=9090, + namespace_pattern={"allow": ["default"]}, + ) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + workunits: List[Entity] = list(source.get_workunits_internal()) + + containers = [wu for wu in workunits if isinstance(wu, Container)] + datasets = [wu for wu in workunits if isinstance(wu, Dataset)] + + # Should only have default namespace + assert len(containers) == 1 + assert containers[0].display_name == "default" + + # Should have default namespace tables + assert len(datasets) >= 2 + table_names = {d.display_name for d in datasets} + assert "test_table1" in table_names + assert "test_table2" in table_names + + source.close() + + def test_ingestion_with_table_filter(self, mock_hbase_connection): + """Test ingestion with table filtering.""" + config = HBaseSourceConfig( + host="localhost", + port=9090, + table_pattern={"allow": ["test_table1"]}, + ) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + workunits: List[Entity] = list(source.get_workunits_internal()) + + datasets = [wu for wu in workunits if isinstance(wu, Dataset)] + + # Should only have 1 table + assert len(datasets) == 1 + table_names = {d.display_name for d in datasets} + assert "test_table1" in table_names + + source.close() + + def test_schema_extraction(self, mock_hbase_connection): + """Test schema extraction from column families.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + source._connect() + descriptor = source._get_table_descriptor("test_table1") + + # Verify column families are extracted + assert "cf1" in descriptor["column_families"] + assert "cf2" in descriptor["column_families"] + + source.close() + + def test_custom_properties(self, mock_hbase_connection): + """Test custom properties are captured.""" + config = HBaseSourceConfig(host="localhost", port=9090) + ctx = PipelineContext(run_id="test-integration") + source = HBaseSource(ctx, config) + + workunits: List[Entity] = list(source.get_workunits_internal()) + datasets = [wu for wu in workunits if isinstance(wu, Dataset)] + + # Verify datasets have custom properties + assert len(datasets) > 0 + for dataset in datasets: + assert hasattr(dataset, 
"custom_properties") + # Should have some custom properties populated + assert len(dataset.custom_properties) > 0 + + source.close()