<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
  <channel>
    <title>Shiwon Kim</title>
    <link>https://cool-kim.tistory.com/</link>
    <description></description>
    <language>ko</language>
    <pubDate>Tue, 7 Apr 2026 10:48:45 +0900</pubDate>
    <generator>TISTORY</generator>
    <ttl>100</ttl>
    <managingEditor>gool</managingEditor>
    <image>
      <title>Shiwon Kim</title>
      <url>https://tistory1.daumcdn.net/tistory/3839558/attach/0bae37eda2934bba84ed358ffc8b301c</url>
      <link>https://cool-kim.tistory.com</link>
    </image>
    <item>
      <title>02. 시스템 소프트웨어 및 윈도우와 컴퓨터의 단위</title>
      <link>https://cool-kim.tistory.com/66</link>
      <description>&lt;p&gt;&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/Fb19t/btrQ0GbmnaG/Thw7S5oWdzj5x8ZIIjteck/IT%ED%86%B5%ED%95%A9%EA%B8%B0%EC%B4%88.pdf?attach=1&amp;amp;knm=tfile.pdf&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;IT통합기초.pdf&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;2.17MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;574&quot; data-origin-height=&quot;852&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/GuDge/btrQZQljH4X/UiEJcUdEowgs7fWO582iwK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/GuDge/btrQZQljH4X/UiEJcUdEowgs7fWO582iwK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/GuDge/btrQZQljH4X/UiEJcUdEowgs7fWO582iwK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FGuDge%2FbtrQZQljH4X%2FUiEJcUdEowgs7fWO582iwK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;574&quot; height=&quot;852&quot; data-origin-width=&quot;574&quot; data-origin-height=&quot;852&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;587&quot; data-origin-height=&quot;849&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/Jm7Op/btrQZtjDYzD/n3DX9KcrgEIfZ1pK90yLo0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/Jm7Op/btrQZtjDYzD/n3DX9KcrgEIfZ1pK90yLo0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/Jm7Op/btrQZtjDYzD/n3DX9KcrgEIfZ1pK90yLo0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FJm7Op%2FbtrQZtjDYzD%2Fn3DX9KcrgEIfZ1pK90yLo0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;587&quot; height=&quot;849&quot; data-origin-width=&quot;587&quot; data-origin-height=&quot;849&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;591&quot; data-origin-height=&quot;852&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/crjHWY/btrQYXS43Ws/C54b28K2YkkQN5kMkDjQB1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/crjHWY/btrQYXS43Ws/C54b28K2YkkQN5kMkDjQB1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/crjHWY/btrQYXS43Ws/C54b28K2YkkQN5kMkDjQB1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcrjHWY%2FbtrQYXS43Ws%2FC54b28K2YkkQN5kMkDjQB1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;591&quot; height=&quot;852&quot; data-origin-width=&quot;591&quot; data-origin-height=&quot;852&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;589&quot; data-origin-height=&quot;839&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/7dGgV/btrQYssxalH/NYFtOCkJOeXxcokEK3yMA1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/7dGgV/btrQYssxalH/NYFtOCkJOeXxcokEK3yMA1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/7dGgV/btrQYssxalH/NYFtOCkJOeXxcokEK3yMA1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2F7dGgV%2FbtrQYssxalH%2FNYFtOCkJOeXxcokEK3yMA1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;589&quot; height=&quot;839&quot; data-origin-width=&quot;589&quot; data-origin-height=&quot;839&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;595&quot; data-origin-height=&quot;844&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/b1OvQu/btrQZWMxNHy/t55Bq7aCCroCjTMWUaG0BK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/b1OvQu/btrQZWMxNHy/t55Bq7aCCroCjTMWUaG0BK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/b1OvQu/btrQZWMxNHy/t55Bq7aCCroCjTMWUaG0BK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fb1OvQu%2FbtrQZWMxNHy%2Ft55Bq7aCCroCjTMWUaG0BK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;595&quot; height=&quot;844&quot; data-origin-width=&quot;595&quot; data-origin-height=&quot;844&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;596&quot; data-origin-height=&quot;839&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/EsRey/btrQYYxFrlM/VCpOI2wa5vFsluiEomPCj0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/EsRey/btrQYYxFrlM/VCpOI2wa5vFsluiEomPCj0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/EsRey/btrQYYxFrlM/VCpOI2wa5vFsluiEomPCj0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FEsRey%2FbtrQYYxFrlM%2FVCpOI2wa5vFsluiEomPCj0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;596&quot; height=&quot;839&quot; data-origin-width=&quot;596&quot; data-origin-height=&quot;839&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/IT 기초</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/66</guid>
      <comments>https://cool-kim.tistory.com/66#entry66comment</comments>
      <pubDate>Fri, 11 Nov 2022 18:09:05 +0900</pubDate>
    </item>
    <item>
      <title>01. 클라이언트 서버 구조 및 컴퓨터의 구성</title>
      <link>https://cool-kim.tistory.com/65</link>
      <description>&lt;p&gt;&lt;figure class=&quot;fileblock&quot; data-ke-align=&quot;alignCenter&quot;&gt;&lt;a href=&quot;https://blog.kakaocdn.net/dn/yhASw/btrQ0GWFUHx/Mrren9QTlijKCv4S21wco0/IT%ED%86%B5%ED%95%A9%EA%B8%B0%EC%B4%88.pdf?attach=1&amp;amp;knm=tfile.pdf&quot; class=&quot;&quot;&gt;
    &lt;div class=&quot;image&quot;&gt;&lt;/div&gt;
    &lt;div class=&quot;desc&quot;&gt;&lt;div class=&quot;filename&quot;&gt;&lt;span class=&quot;name&quot;&gt;IT통합기초.pdf&lt;/span&gt;&lt;/div&gt;
&lt;div class=&quot;size&quot;&gt;2.17MB&lt;/div&gt;
&lt;/div&gt;
  &lt;/a&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;587&quot; data-origin-height=&quot;865&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bJkZP1/btrQZhQ7LSQ/hJwXKVHlt5EP9sNKYi0ZFk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bJkZP1/btrQZhQ7LSQ/hJwXKVHlt5EP9sNKYi0ZFk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bJkZP1/btrQZhQ7LSQ/hJwXKVHlt5EP9sNKYi0ZFk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbJkZP1%2FbtrQZhQ7LSQ%2FhJwXKVHlt5EP9sNKYi0ZFk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;587&quot; height=&quot;865&quot; data-origin-width=&quot;587&quot; data-origin-height=&quot;865&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;589&quot; data-origin-height=&quot;853&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/FfaTO/btrQYEl1Puw/xW6LaL2NNPUJX6hBWV5kmk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/FfaTO/btrQYEl1Puw/xW6LaL2NNPUJX6hBWV5kmk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/FfaTO/btrQYEl1Puw/xW6LaL2NNPUJX6hBWV5kmk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FFfaTO%2FbtrQYEl1Puw%2FxW6LaL2NNPUJX6hBWV5kmk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;589&quot; height=&quot;853&quot; data-origin-width=&quot;589&quot; data-origin-height=&quot;853&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;586&quot; data-origin-height=&quot;853&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/blDwh5/btrQZch4GJP/6o78M5RoicPPrBZkjwbhbK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/blDwh5/btrQZch4GJP/6o78M5RoicPPrBZkjwbhbK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/blDwh5/btrQZch4GJP/6o78M5RoicPPrBZkjwbhbK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FblDwh5%2FbtrQZch4GJP%2F6o78M5RoicPPrBZkjwbhbK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;586&quot; height=&quot;853&quot; data-origin-width=&quot;586&quot; data-origin-height=&quot;853&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;590&quot; data-origin-height=&quot;836&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/dcbcET/btrQYYEn3k8/MheilbyRja1zAK0p5hfEh0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/dcbcET/btrQYYEn3k8/MheilbyRja1zAK0p5hfEh0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/dcbcET/btrQYYEn3k8/MheilbyRja1zAK0p5hfEh0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FdcbcET%2FbtrQYYEn3k8%2FMheilbyRja1zAK0p5hfEh0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;590&quot; height=&quot;836&quot; data-origin-width=&quot;590&quot; data-origin-height=&quot;836&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/IT 기초</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/65</guid>
      <comments>https://cool-kim.tistory.com/65#entry65comment</comments>
      <pubDate>Fri, 11 Nov 2022 18:05:06 +0900</pubDate>
    </item>
    <item>
      <title>07. 웹 크롤링과 법적 문제</title>
      <link>https://cool-kim.tistory.com/64</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;웹 크롤링 시 법적 권고안을 무시하면 법적인 문제가 발생할 수 있다.&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;웹 크롤링(Web Crawling)은 인터넷 서비스 '웹' 브라우저를 통해 인터넷을 돌아다니면서&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;인터넷 상의 정보를 가져가는 행위를 거미가 거미줄 위를 기어다니는 것처럼 표현한 것이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* 웹(Web)은 FTP처럼 대중적으로 사용되는 인터넷 서비스 또는 규약의 한 종류&lt;span&gt;(웹&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&amp;ne; 인터넷)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;웹 상의 정보에 대한 권리는 서비스 공급자에게 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;웹 크롤링을 수행하는 클라이언트는 &lt;b&gt;정보 제공자의 요구&lt;/b&gt;를 반드시 받아들여야 할 의무는 없지만,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;접근 금지를 요청하거나 법적인 문제를 제기&lt;/b&gt;하는 주체가 공급자이므로 받아들이는 것이 좋다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;robots.txt&lt;/b&gt;는 관리자가 크롤링을 허용하는 정보와 허용하지 않는 정보를 명시해놓은 파일이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;페이지 URL 주소 뒤에 /robots.txt를 붙이면 robots.txt 파일을 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://www.google.com/robots.txt&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://www.google.com/robots.txt&lt;/a&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;'Disallow' 표시가 된 것은 Google 페이지의 관리자가 &lt;b&gt;크롤링하지 않을 것을 요구&lt;/b&gt;한 항목이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;크롤링 행위가 직접 법적인 문제로 이어지지는 않지만, 해당&amp;nbsp;&lt;b&gt;요구사항은 지킬 것이 권고&lt;/b&gt;된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://www.google.com/search?q=%EC%82%AC%EB%9E%8C%EC%9D%B8+%EC%9E%A1%EC%BD%94%EB%A6%AC%EC%95%84+%ED%81%AC%EB%A1%A4%EB%A7%81&amp;amp;rlz=1C1NHXL_koUS841US841&amp;amp;oq=%EC%82%AC%EB%9E%8C%EC%9D%B8+%EC%9E%A1%EC%BD%94%EB%A6%AC%EC%95%84+%ED%81%AC%EB%A1%A4%EB%A7%81&amp;amp;aqs=chrome.0.0i512l10.2715j0j4&amp;amp;sourceid=chrome&amp;amp;ie=UTF-8&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;사람인 잡코리아 크롤링&lt;/a&gt;, &lt;a href=&quot;https://www.google.com/search?q=%EC%97%AC%EA%B8%B0%EC%96%B4%EB%95%8C+%EC%95%BC%EB%86%80%EC%9E%90+%ED%81%AC%EB%A1%A4%EB%A7%81&amp;amp;rlz=1C1NHXL_koUS841US841&amp;amp;ei=ej7aYqWsBLmz2roPi6qFoAs&amp;amp;ved=0ahUKEwil0cie6ov5AhW5mVYBHQtVAbQQ4dUDCA4&amp;amp;uact=5&amp;amp;oq=%EC%97%AC%EA%B8%B0%EC%96%B4%EB%95%8C+%EC%95%BC%EB%86%80%EC%9E%90+%ED%81%AC%EB%A1%A4%EB%A7%81&amp;amp;gs_lcp=Cgdnd3Mtd2l6EAM6BwgAEEcQsAM6BQguEIAEOgsIABCABBCxAxCDAToFCAAQgAQ6EQguEIAEELEDEIMBEMcBENEDOgQILhADOgQIABADOhQIABDqAhC0AhCKAxC3AxDUAxDlAjoRCAAQ6gIQtAIQigMQtwMQ5QI6CAgAEIAEELEDOgsILhCABBCxAxDUAjoLCC4QgAQQxwEQrwFKBAhBGABKBAhGGABQyQZY2h9g1yBoCnABeASAAe0BiAGRGZIBBjAuMjMuMZgBAKABAbABBcgBCsABAQ&amp;amp;sclient=gws-wiz&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;여기어때 야놀자 크롤링&lt;/a&gt;, &lt;a href=&quot;https://www.google.com/search?q=%EB%8B%A4%EC%9C%88%ED%94%84%EB%A1%9C%ED%8D%BC%ED%8B%B0+%EB%84%A4%EC%9D%B4%EB%B2%84+%ED%81%AC%EB%A1%A4%EB%A7%81&amp;amp;rlz=1C1NHXL_koUS841US841&amp;amp;ei=lT_aYsiiAua02roPvpaSqAs&amp;amp;ved=0ahUKEwiIwb-l64v5AhVmmlYBHT6LBLUQ4dUDCA4&amp;amp;uact=5&amp;amp;oq=%EB%8B%A4%EC%9C%88%ED%94%84%EB%A1%9C%ED%8D%BC%ED%8B%B0+%EB%84%A4%EC%9D%B4%EB%B2%84+%ED%81%AC%EB%A1%A4%EB%A7%81&amp;amp;gs_lcp=Cgdnd3Mtd2l6EAMyBQghEKABOgsIABCABBCxAxCDAToRCC4QgAQQsQMQgwEQxwEQ0QM6BQguEIAEOggIABCABBCxAzoICC4QgAQQ1AI6BQgAEIAEOgsILhCABBDHARCvAToECAAQHjoGCAAQHhAPOgYIABAeEAVKBAhBGABKBAhGGABQAFjsHWDXHmgFcAB4BoABpwGIAdIdkgEEMS4yOZgBAKABAcABAQ&amp;amp;sclient=gws-wiz&quot; target=&quot;_blank&quot; 
rel=&quot;noopener&quot;&gt;다윈프로퍼티 네이버 크롤링&lt;/a&gt; 등은&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;웹 크롤링 행위가 서비스 관리자의 조치로 법적인 문제로까지 이어진 예시이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이처럼 크롤링 행위 자체가 불법은 아니지만,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;사업체 소속으로 상대방의 사업 영역을 침해할 경우 충분히 법적인 문제로 이어질 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/웹 크롤링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/64</guid>
      <comments>https://cool-kim.tistory.com/64#entry64comment</comments>
      <pubDate>Fri, 22 Jul 2022 15:13:19 +0900</pubDate>
    </item>
    <item>
      <title>06. 자동화 도구: Selenium</title>
      <link>https://cool-kim.tistory.com/63</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;Selenium은 마우스나 키보드를 통한 브라우저 조작을 자동화하는 도구이다.&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Selenium은 사용자에 의해 이루어지는 웹 브라우저의 조작 및 제어를 자동화하는 도구이며,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;사용자의 입력에 따라 동적으로 움직이는 페이지를 크롤링하는 &lt;b&gt;동적 페이지 크롤링&lt;/b&gt;에 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Selenium 사용 시 &lt;b&gt;웹 브라우저 제어 도구인 드라이버(Driver)&lt;/b&gt;로 별도의 창을 열게 되므로,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;웹 브라우저를 실행하지 않고 페이지 소스만을 가져오는 requests 모듈에 비해 속도가 느리다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;그러나 requests 모듈의 경우 정적 웹 문서에 대한 크롤링만 가능한 반면,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;selenium은 마우스 클릭 및 스크롤링, 키보드 입력 등 &lt;b&gt;모든 동적 활동에 대응&lt;/b&gt;할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;따라서 동적 페이지의 비중이 커진 요즘의 웹 크롤링 작업에서 활용도가 높다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;아래 링크에서 selenium 자동화 도구를 활용한 동적 페이지 크롤링 실습 코드를 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-06-selenium.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-06-selenium.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1658467565814&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/web-crawling&quot; data-og-description=&quot;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-06-selenium.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/web-crawling&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/qgZhh/hyPayfzp67/Qdge1zjPIG8F2QvSCwTlX1/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-06-selenium.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-06-selenium.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/qgZhh/hyPayfzp67/Qdge1zjPIG8F2QvSCwTlX1/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/web-crawling&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/웹 크롤링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/63</guid>
      <comments>https://cool-kim.tistory.com/63#entry63comment</comments>
      <pubDate>Fri, 22 Jul 2022 14:26:36 +0900</pubDate>
    </item>
    <item>
      <title>05. 데이터의 형태(2): HTML</title>
      <link>https://cool-kim.tistory.com/62</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;HTML 데이터는 태그명이 태그 속 정보를 대표하지 않아 크롤링이 까다롭다.&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;XML은 데이터 저장 및 정보 전달을 주된 목적&lt;/b&gt;으로 하는 언어이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;따라서 각 태그가 별다른 기능을 제공하지 않고,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;태그의 이름이 각 태그 안의 정보를 대표하는 역할을 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;즉, &lt;b&gt;태그의 이름으로부터 태그 속 정보의 내용을 유추&lt;/b&gt;할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;반면, &lt;b&gt;HTML은 클라이언트에게 보이는 화면을 디자인&lt;/b&gt;하기 위한 언어이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;각 태그가 특정한 디자인 기능을 제공하는 것을 주된 목적으로 하기 때문에,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;HTML 태그의 이름에는 각 태그가 제공하는 기능이 반영된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;태그의 이름이 태그 속 정보의 내용과 무관&lt;/b&gt;하므로 XML 데이터에 비해 크롤링이 까다롭다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이번 포스팅에서는 &lt;b&gt;HTML 형태의 데이터&lt;/b&gt;를 처리하는 방식에 대해 알아본다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;아래 링크에서 실시간 이슈 키워드와 베스트셀러 정보를 수집하는 실습 코드를 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-05-html.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-05-html.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1658409160516&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/web-crawling&quot; data-og-description=&quot;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-05-html.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/web-crawling&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/02aaN/hyPapWJzar/7Ft1cPu9KOVg0asj4vEjq1/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-05-html.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-05-html.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/02aaN/hyPapWJzar/7Ft1cPu9KOVg0asj4vEjq1/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/web-crawling&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/웹 크롤링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/62</guid>
      <comments>https://cool-kim.tistory.com/62#entry62comment</comments>
      <pubDate>Thu, 21 Jul 2022 22:13:08 +0900</pubDate>
    </item>
    <item>
      <title>04. 데이터의 형태(1): XML</title>
      <link>https://cool-kim.tistory.com/61</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;웹 상에서 가져올 수 있는 데이터의 형태에는 JSON, HTML, XML이 있다.&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;(1) JSON(JavaScript Object Notation)&lt;/b&gt;&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;JSON 표현식: 용량이 적고 사람과 기계가 모두 이해하기 쉬운 형태(JSON의 목적)&lt;/li&gt;
&lt;li&gt;&lt;b&gt;{key:value} 쌍으로 이루어진 데이터&lt;/b&gt;를 저장하거나 전송할 때 사용하는 개방형 표준 포맷&lt;/li&gt;
&lt;li&gt;Javascript에서 객체 생성 시 사용하는 표현식&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;(2) HTML(Hypertext Markup Language)&lt;/b&gt;&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;웹사이트에서 화면에 표시되는 정보를 약속한 것&lt;/li&gt;
&lt;li&gt;Hypertext: 단순 텍스트 이상의 텍스트(링크 등의 개념이 포함된 텍스트)&lt;/li&gt;
&lt;li&gt;Markup: 꺾쇠(&amp;lt;, &amp;gt;)로 이루어진 태그를 사용하는 규격&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;(3) XML(eXtensible Markup Language)&lt;/b&gt;&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;HTML의 한계를 극복할 목적으로 만들어진 다목적 마크업 언어&lt;/li&gt;
&lt;li&gt;XML은 HTML처럼 데이터를 보여주는 목적이 아닌, &lt;b&gt;데이터를 저장하고 전달할 목적&lt;/b&gt;으로 사용&lt;/li&gt;
&lt;li&gt;XML 태그는 HTML 태그처럼 정해져 있지 않고, 사용자가 직접 정의 가능&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터의 세 가지 형태 중 JSON과 XML은 Python에서 &lt;b&gt;딕셔너리 형태로 사용이 가능&lt;/b&gt;하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;반면, HTML은 딕셔너리 자료를 다루는 방식이 아닌 다른 방식으로 사용해야만 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;앞서 requests 모듈 실습에서 JSON 데이터를 &lt;b&gt;딕셔너리와 리스트의 형태로 변환&lt;/b&gt;하는&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;json()&lt;/b&gt;을&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이미 살펴보았으므로 JSON 데이터 처리 방식은 넘어가도록 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이번 포스팅에서는 &lt;b&gt;XML 형태의 데이터&lt;/b&gt;를 처리하는 방식에 대해 알아본다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;XML 데이터는 &lt;b&gt;딕셔너리 형태와 다른 형태로 모두 처리가 가능&lt;/b&gt;하다는 특징이 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;아래 링크에서 XML 형태의 날씨 정보와 뉴스 기사 정보를 수집하는 실습 코드를 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-04-xml.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-04-xml.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1658406516324&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/web-crawling&quot; data-og-description=&quot;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-04-xml.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/web-crawling&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/lXlH5/hyPatEJ0Ol/1z8TYGqP6weoKKOdPZ7TBK/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-04-xml.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-04-xml.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/lXlH5/hyPatEJ0Ol/1z8TYGqP6weoKKOdPZ7TBK/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/web-crawling&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/웹 크롤링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/61</guid>
      <comments>https://cool-kim.tistory.com/61#entry61comment</comments>
      <pubDate>Thu, 21 Jul 2022 15:22:40 +0900</pubDate>
    </item>
    <item>
      <title>03. 데이터 프레임(Data Frame)</title>
      <link>https://cool-kim.tistory.com/60</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;데이터 프레임은 행과 열로 이루어진 테이블 형태의 자료구조를&lt;br /&gt;데이터 분석의 관점에서 부르는 말이다.&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터 프레임(dataframe)은 테이블 형태로 데이터를 처리할 수 있어 데이터 분석에 용이하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;웹 크롤링의 목적은 웹 상의 데이터를 분석에 용이한 &lt;b&gt;dataframe 형태로 정리정돈&lt;/b&gt;하는 것이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;앞선 포스팅에서 다룬 웹 페이지 요청 과정을 포함한 &lt;b&gt;기본적인 웹 크롤링 절차&lt;/b&gt;는 다음과 같다.&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;웹 페이지에 요청할 요청문 준비(requests)&lt;/li&gt;
&lt;li&gt;get 요청을 보내고 요청에 대한 응답 수용&lt;/li&gt;
&lt;li&gt;파싱(parsing): 응답의 종류(JSON, HTML, XML)에 따라 해석 도구(parser) 상이&lt;/li&gt;
&lt;li&gt;해석 방식에 따라 데이터를 처리하는 프로세스 상이&lt;/li&gt;
&lt;li&gt;프로세스에 맞게 데이터 정리정돈(dataframe 변환)&lt;/li&gt;
&lt;li&gt;정리/가공한 데이터를 파일로 저장&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Python에서는 &lt;b&gt;list&lt;/b&gt;와 &lt;b&gt;dictionary&lt;/b&gt;로 데이터 프레임을 표현하며,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터 프레임 생성에는 &lt;b&gt;pandas 라이브러리&lt;/b&gt;가 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Pandas를 활용하여 list와 dictionary로 표현된 자료를 dataframe 형태로 변환한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이번 포스팅에서는 크롤링한 데이터를 &lt;b&gt;dataframe으로 변환&lt;/b&gt; 또는 직접 dataframe을 생성한 뒤,&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터를 가공하고 가공된 자료를 다시 CSV 파일로 저장해보는 실습을 진행한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이와 더불어 웹 크롤링에서 매우 중요한 &lt;b&gt;time 모듈&lt;/b&gt;을 사용해볼 예정이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;실습 후 웹 상의 자료를 &lt;b&gt;list/dictionary 형태&lt;/b&gt;로 가져와 &lt;b&gt;dataframe으로 저장&lt;/b&gt;할 수 있어야 하며,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;크롤링에서 time 모듈이 중요한 이유를 이해하고 이를 수시로 적용하는 것에 익숙해져야 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;실습 코드는 아래 링크에서 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-03-data_frame.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-03-data_frame.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1658404215231&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/web-crawling&quot; data-og-description=&quot;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-03-data_frame.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/web-crawling&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/RaFDP/hyPaxG7edd/ny0boBYk7IDiLht2sNfswK/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-03-data_frame.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/web-crawling/blob/main/web-crawling-03-data_frame.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/RaFDP/hyPaxG7edd/ny0boBYk7IDiLht2sNfswK/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/web-crawling&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/web-crawling development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/웹 크롤링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/60</guid>
      <comments>https://cool-kim.tistory.com/60#entry60comment</comments>
      <pubDate>Tue, 19 Jul 2022 18:07:23 +0900</pubDate>
    </item>
    <item>
      <title>20. 군집 분석: K-Means Clustering</title>
      <link>https://cool-kim.tistory.com/59</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;군집 분석은 알고리즘이 스스로 라벨링 되지 않은 데이터를&lt;br /&gt;분류하고 유사한 데이터끼리 묶는 비지도 학습 방식이다.&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;머신러닝에는 두 가지 모델 학습 방식이 존재한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;사전에 제공된 정답(레이블) 데이터를 바탕으로 학습하는 &lt;b&gt;지도 학습&lt;/b&gt;(Supervised Learning)과,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;정답이 없는 데이터의 패턴을 스스로 찾아 학습하는 &lt;b&gt;비지도 학습&lt;/b&gt;(Unsupervised Learning)이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;지도 학습에는 분류(Classification)와 회귀(Regression) 모델이 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이번 포스팅에서 다루게 될 &lt;b&gt;군집화(Clustering)는 비지도 학습에 해당&lt;/b&gt;한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;군집화(Clustering)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;군집 분석에서 동일한 군집에 소속된 관측치들은 서로 &lt;b&gt;유사할수록&lt;/b&gt;,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다른 집단에 소속된 관측치들은 &lt;b&gt;유사하지 않을수록&lt;/b&gt; 군집화 모델의 성능이 좋다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;군집/관측치의 유사도는 관측치들 간의 &lt;b&gt;거리나 상관계수&lt;/b&gt;로 정의한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;군집 중심점(Centroid) 또는 밀도(Density)를 활용한 군집화 방식이 존재한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* 군집화 알고리즘&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;K-Means(Centroid-based)&lt;/li&gt;
&lt;li&gt;DBSCAN(Density-based)&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다음과 같은 경우에 군집화를 적용할 수 있다.&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;유사한 추세를 나타내는 주식 종목 그룹화&lt;/li&gt;
&lt;li&gt;고객 행동 패턴, 브랜드, 마켓 세분화(Segmentation)&lt;/li&gt;
&lt;li&gt;이미지 검출&lt;/li&gt;
&lt;li&gt;이상 검출(Anomaly Detection)&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;실루엣 계수(Silhouette Coefficient)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;실루엣 계수는 특정 데이터가 &lt;b&gt;같은 군집 내의 데이터와 얼마나 가깝게&lt;/b&gt; 위치해 있고,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;다른 군집의 데이터와 얼마나 멀리&lt;/b&gt; 떨어져 있는지를 나타내는 지표이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;-1부터 1까지의 값&lt;/b&gt;을 가지며, 1에 가까울수록 Clustering의 품질이 좋다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;실루엣 계수가 0이면 의미 없는 군집화이고, 음수이면 잘못 분류된 것을 의미한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignLeft&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;90&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/Mkvw8/btrGRIxSRAL/SCxEWixhWatQk6EoKTgOSk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/Mkvw8/btrGRIxSRAL/SCxEWixhWatQk6EoKTgOSk/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/Mkvw8/btrGRIxSRAL/SCxEWixhWatQk6EoKTgOSk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FMkvw8%2FbtrGRIxSRAL%2FSCxEWixhWatQk6EoKTgOSk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;200&quot; height=&quot;49&quot; data-origin-width=&quot;366&quot; data-origin-height=&quot;90&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;a(i) : 개체 i와 같은 군집 내에 있는 모든 다른 개체들 사이의 평균 거리&lt;/li&gt;
&lt;li&gt;b(i) : 개체 i와 다른 군집에 있는 개체들 사이의 평균 거리 중 가장 작은 값&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;806&quot; data-origin-height=&quot;545&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bpcFCc/btrGQpeYToa/qwkwZhvXHXxEFHJiAXZy20/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bpcFCc/btrGQpeYToa/qwkwZhvXHXxEFHJiAXZy20/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bpcFCc/btrGQpeYToa/qwkwZhvXHXxEFHJiAXZy20/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbpcFCc%2FbtrGQpeYToa%2FqwkwZhvXHXxEFHJiAXZy20%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;500&quot; height=&quot;338&quot; data-origin-width=&quot;806&quot; data-origin-height=&quot;545&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* &lt;b&gt;개별 객체&lt;/b&gt;의 실루엣 계수(Silhouette Coefficient) 산출&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1309&quot; data-origin-height=&quot;858&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/IsOeF/btrGWa0S3Ma/FRjWSjEPQKpyMTrk3UHa81/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/IsOeF/btrGWa0S3Ma/FRjWSjEPQKpyMTrk3UHa81/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/IsOeF/btrGWa0S3Ma/FRjWSjEPQKpyMTrk3UHa81/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FIsOeF%2FbtrGWa0S3Ma%2FFRjWSjEPQKpyMTrk3UHa81%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;700&quot; height=&quot;459&quot; data-origin-width=&quot;1309&quot; data-origin-height=&quot;858&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;a(i) = 1.04&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;b(i) = min(10.4, 6.4) = 6.4&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignLeft&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;212&quot; data-origin-height=&quot;47&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bu77Qe/btrGQEbWpON/gafBgF70c33d308uftxFK1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bu77Qe/btrGQEbWpON/gafBgF70c33d308uftxFK1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bu77Qe/btrGQEbWpON/gafBgF70c33d308uftxFK1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fbu77Qe%2FbtrGQEbWpON%2FgafBgF70c33d308uftxFK1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;220&quot; height=&quot;49&quot; data-origin-width=&quot;212&quot; data-origin-height=&quot;47&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;전체 데이터에 대한 실루엣 지표는 모든 군집에 속한 &lt;b&gt;개별 객체의 실루엣 지표의 평균&lt;/b&gt;과 같다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignLeft&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;81&quot; data-origin-height=&quot;57&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bPBnj2/btrGRxJjSS7/3uCxZMBKnOWlRNweQXwft1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bPBnj2/btrGRxJjSS7/3uCxZMBKnOWlRNweQXwft1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bPBnj2/btrGRxJjSS7/3uCxZMBKnOWlRNweQXwft1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbPBnj2%2FbtrGRxJjSS7%2F3uCxZMBKnOWlRNweQXwft1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;80&quot; height=&quot;56&quot; data-origin-width=&quot;81&quot; data-origin-height=&quot;57&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;실루엣 분석(Silhouette Analysis)에는 &lt;b&gt;sklearn.metrics&lt;/b&gt; 모듈이 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Method&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;silhouette_samples(X, labels) : 각 데이터 포인트의 실루엣 계수를 산출&lt;/li&gt;
&lt;li&gt;silhouette_score(X, labels) : 전체 데이터의 실루엣 지표를 반환(개별 지표의 평균)&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;K-Means Clustering&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;임의의 &lt;b&gt;군집 중심점&lt;/b&gt;(Centroid) 설정 후 가까운 점들을 모으는 Centroid-based 알고리즘으로,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;자주 쓰이고 어렵지 않은 군집 분석 기법이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;사전에 &lt;b&gt;군집의 수 K&lt;/b&gt;가 정해져야 알고리즘 실행이 가능하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;각 개체는 가장 가까운 중심에 할당되며, &lt;b&gt;같은 중심에 할당된 개체들이 하나의 군집을 생성&lt;/b&gt;한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;553&quot; data-origin-height=&quot;436&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/ctL8wj/btrGQa9rHaD/6tA77yGg88tgDKm7UzASQ1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/ctL8wj/btrGQa9rHaD/6tA77yGg88tgDKm7UzASQ1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/ctL8wj/btrGQa9rHaD/6tA77yGg88tgDKm7UzASQ1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FctL8wj%2FbtrGQa9rHaD%2F6tA77yGg88tgDKm7UzASQ1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;400&quot; height=&quot;315&quot; data-origin-width=&quot;553&quot; data-origin-height=&quot;436&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* K-Means 알고리즘 동작 과정&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;초기 군집 중심점 설정은 랜덤으로 또는 알고리즘의 계산에 의해 이루어진다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1566&quot; data-origin-height=&quot;820&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/cVIPZ9/btrGQ0y41eT/UHqWJrMJIYhIZiYtunisxK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/cVIPZ9/btrGQ0y41eT/UHqWJrMJIYhIZiYtunisxK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/cVIPZ9/btrGQ0y41eT/UHqWJrMJIYhIZiYtunisxK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FcVIPZ9%2FbtrGQ0y41eT%2FUHqWJrMJIYhIZiYtunisxK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;800&quot; height=&quot;419&quot; data-origin-width=&quot;1566&quot; data-origin-height=&quot;820&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;K-Means 알고리즘은 쉽고 간결하며 대용량 데이터에도 활용이 가능하다는 장점이 있어,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;일반적으로 군집화 작업에서 가장 많이 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다만 &lt;b&gt;초기에 몇 개의 군집 중심점을 설정&lt;/b&gt;해야 하는지에 대한 명확한 기준이 없고,&lt;br /&gt;거리 기반의 알고리즘이므로 &lt;b&gt;이상치(Outlier)에 취약&lt;/b&gt;하다는 단점이 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;K-Means Clustering에는 sklearn.cluster 모듈의 &lt;b&gt;KMeans API&lt;/b&gt;를 사용한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Argument&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;n_clusters : 군집 중심점의 개수&lt;/li&gt;
&lt;li&gt;init : 초기에 군집 중심점의 좌표를 설정하는 방식(일반적으로 &lt;b&gt;K-Means++&lt;/b&gt; 방식 사용)&lt;/li&gt;
&lt;li&gt;max_iter : 최대 반복 횟수(도달 전에 모든 데이터의 중심 이동이 다 이루어지면 그대로 종료)&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;군집 중심점의 좌표는 랜덤으로 설정할 수도, 내부 알고리즘을 통해 설정할 수도 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;K-Means++&lt;/b&gt; 알고리즘은 &lt;b&gt;기존의 K-Means 알고리즘을 개선&lt;/b&gt;한 알고리즘이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;K-Means 알고리즘은 모든 중심을 랜덤하게 위치시키므로 매번 결과가 달라질 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;또, 한 번에 K개의 중심을 랜덤하게 생성하여 중심 간 거리가 가까우면 성능이 저하될 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;K-Means++의 경우 K개의 중심을 한 번에 생성하지 않고 데이터 중 하나를 무작위로 선택하여,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;해당 데이터 포인트를 첫 번째 중심으로 지정한 후 최대한 거리가 먼 곳에 다음 중심을 생성한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이 같은 방식으로 &lt;b&gt;한 번에 하나씩 K번에 걸쳐&lt;/b&gt; K개의 군집 중심점을 설정한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Method&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;fit_transform(dataset) : 학습 데이터를 이용하여 모델 학습&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Attribute&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;labels_ : 각 데이터 포인트가 속한 군집 중심점 레이블&lt;/li&gt;
&lt;li&gt;cluster_centers_ : 각 군집 중심점의 좌표(shape=[군집 개수, feature 개수])&lt;/li&gt;
&lt;li&gt;inertia_ : 각 데이터와 가장 가까운 군집 중심점 사이의 거리 제곱의 합(최적의 K를 찾을 때 사용)&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;군집 중심점의 &lt;b&gt;좌표 위치를 시각화&lt;/b&gt;할 때 cluster_centers_를 사용할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;최적의 K 결정&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;앞서 말한 것처럼 K-Means Clustering은 군집 중심점의 개수를 모른다는 단점이 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;최적의 K&lt;/b&gt;(군집 중심점의 개수)를 찾는 것이 K-Means의 가장 중요한 단계이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이를 도와주는 방법으로 &lt;b&gt;Elbow method&lt;/b&gt;와 &lt;b&gt;Silhouette method&lt;/b&gt;가 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(1) Elbow method&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Total intra-cluster variation/Total within-cluster sum of squared distance(WCSS)가&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;최소가 되는 K를 찾는 방법으로, 군집 내의 거리의 합(변동성)을 나타내는 &lt;b&gt;inertia가 급감하는&lt;/b&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;지점(Elbow)&lt;/b&gt;을 군집의 개수로 사용한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변동성이 급격히 떨어진다는 것은 유사한 데이터가 잘 묶였다는 것을 의미한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;inertia 수치는 K-Means API의 &lt;b&gt;inertia_&lt;/b&gt; 속성으로 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;745&quot; data-origin-height=&quot;475&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/Rw65G/btrGZKOMUZG/GM9wrleQxrqIwiUdZkOXf0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/Rw65G/btrGZKOMUZG/GM9wrleQxrqIwiUdZkOXf0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/Rw65G/btrGZKOMUZG/GM9wrleQxrqIwiUdZkOXf0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FRw65G%2FbtrGZKOMUZG%2FGM9wrleQxrqIwiUdZkOXf0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;400&quot; height=&quot;255&quot; data-origin-width=&quot;745&quot; data-origin-height=&quot;475&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(2) Silhouette method&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;Elbow method를 사용하다 보면 inertia 그래프가 꺾이는 지점이 여러 개인 경우가 존재하므로,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;최적의 K를 결정할 때 어느 정도 감에 의존할 수밖에 없다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;따라서 &lt;b&gt;실루엣 계수의 시각화&lt;/b&gt;를 통해 K를 최적화하는 Silhouette method를 함께 사용한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span&gt;실루엣 계수는&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;&lt;b&gt;군집 내의 거리와&amp;nbsp;군집 간의 거리를 모두 고려&lt;/b&gt;하기 때문에 참고하기 좋은 지표이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1461&quot; data-origin-height=&quot;482&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bSnQsF/btrGS51vyGL/kPVGTN3qwPyttzp4a7n0Mk/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bSnQsF/btrGS51vyGL/kPVGTN3qwPyttzp4a7n0Mk/img.png&quot; data-alt=&quot;군집 중심점의 개수(K)를 두 개(왼쪽)와 세 개(오른쪽)로 설정&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bSnQsF/btrGS51vyGL/kPVGTN3qwPyttzp4a7n0Mk/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbSnQsF%2FbtrGS51vyGL%2FkPVGTN3qwPyttzp4a7n0Mk%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;700&quot; height=&quot;231&quot; data-origin-width=&quot;1461&quot; data-origin-height=&quot;482&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;군집 중심점의 개수(K)를 두 개(왼쪽)와 세 개(오른쪽)로 설정&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;위의 그림에서 x축은 실루엣 계수, y축은 개별 군집과 이에 속하는 데이터이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;빨간색 점선은 전체 데이터의 &lt;b&gt;평균 실루엣 계수&lt;/b&gt;를 나타낸다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;개별 군집(y축)의 높이&lt;/b&gt;를 통해 군집 내 데이터의 분포와 뭉쳐 있는 정도를 가늠할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;군집 중심점의 개수를 두 개로 설정한 왼쪽의 경우 평균 실루엣 계수가 0.7보다 조금 작다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;1번 군집의 데이터는 평균보다 높은 실루엣 계수를 가지지만,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;0번 군집의 경우 대부분의 데이터가 평균 이하의 실루엣 계수를 가지며, &lt;b&gt;그림이 넓게 분포&lt;/b&gt;한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;1번 군집은 &lt;b&gt;내부 데이터가 뭉쳐있지만&lt;/b&gt; 0번 군집은 &lt;b&gt;내부 데이터 간 거리가 멀다&lt;/b&gt;고 해석할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;오른쪽 그림은 군집 중심점의 개수를 세 개로 설정한 경우이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;평균 실루엣 계수 값은 두 개의 군집 중심점을 생성했을 때보다 작다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;0번 군집의 경우 모두 평균보다 높은 실루엣 계수를 가지는 것으로 보아 &lt;b&gt;데이터가 잘 뭉쳐있지만&lt;/b&gt;,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;1번과 2번 군집의 데이터는 대체로 &lt;b&gt;평균보다 낮은 실루엣 계수&lt;/b&gt;를 가진다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이상치에 취약한 군집화 모델의 특성 상&amp;nbsp;&lt;span&gt;전체 데이터의 평균 실루엣 계수는&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;쉽게 변동할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;따라서 &lt;b&gt;평균 실루엣 계수와 개별 군집의 높이를 복합적으로 따져서&lt;/b&gt;&amp;nbsp;최적의 K를 결정한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;개별 군집 간 적당한 거리를 유지하면서 군집 내 데이터가 잘 뭉쳐있는 경우를 찾는다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;여기까지 군집 분석에 대한 설명을 마친다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;실루엣 분석(Silhouette Analysis)과 K-Means 군집화 실습 코드는 아래 링크에서 볼 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-20-k-means_clustering.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-20-k-means_clustering.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1665800559196&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/data-analytics&quot; data-og-description=&quot;Contribute to tldnjs1231/data-analytics development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-20-k-means_clustering.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/data-analytics&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/cUO4nA/hyP9j9MLQq/nL2iFHF3XKg9HaceXpRib0/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-20-k-means_clustering.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-20-k-means_clustering.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/cUO4nA/hyP9j9MLQq/nL2iFHF3XKg9HaceXpRib0/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/data-analytics&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/data-analytics development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/빅데이터 분석과 모델링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/59</guid>
      <comments>https://cool-kim.tistory.com/59#entry59comment</comments>
      <pubDate>Sun, 10 Jul 2022 00:26:53 +0900</pubDate>
    </item>
    <item>
      <title>19. 차원 축소 및 주성분 분석(PCA)</title>
      <link>https://cool-kim.tistory.com/58</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;차원을 축소하여 복잡도를 줄이면서 정보의 손실을 최소화할 수 있는 방법&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변수(feature) 간의 독립성이 만족되면 이론적으로는 변수가 많아질수록 모델의 성능이 향상된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;하지만 실제로는 변수들이 독립적이지 않고, &lt;b&gt;변수의 수가 일정 수준 이상이 되면 성능이 저하&lt;/b&gt;된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;머신러닝에서 &lt;b&gt;차원(Dimension)은 변수(feature)&lt;/b&gt; 하나하나를 의미한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;독립변수가 한 개면 1차원, 두 개면 2차원, ... , n개면 n차원인 식이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;차원의 저주(Curse of Dimensionality)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;앞서 말했듯이 변수의 수, 즉 차원이 증가하면 모델의 복잡도가 높아지고 성능이 저하된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이처럼 차원이 커지면서 &lt;b&gt;한정된 관측치로 증가하는 차원의 패턴을 잘 설명하지 못하게&lt;/b&gt; 되고,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;모델의 성능이 저하되는 현상을 &lt;b&gt;차원의 저주(Curse of Dimensionality)&lt;/b&gt;라고 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원이 커지면 데이터 간의 거리가 증가하여 &lt;b&gt;모델의 예측 신뢰도가 하락&lt;/b&gt;한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;아래 그림에서 실데이터의 수는 다섯 개로 일정하지만, 차원의 수가 점점 늘어남에 따라&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;존재하는 데이터로 설명할 수 없는 빈 공간이 많아진다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;605&quot; data-origin-height=&quot;220&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/boQA6o/btrGzcNrtBi/XL2mTbVZ2pOGrfIB5LSuA0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/boQA6o/btrGzcNrtBi/XL2mTbVZ2pOGrfIB5LSuA0/img.png&quot; data-alt=&quot;차원의 저주(Curse of Dimensionality)&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/boQA6o/btrGzcNrtBi/XL2mTbVZ2pOGrfIB5LSuA0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FboQA6o%2FbtrGzcNrtBi%2FXL2mTbVZ2pOGrfIB5LSuA0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;500&quot; height=&quot;182&quot; data-origin-width=&quot;605&quot; data-origin-height=&quot;220&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;차원의 저주(Curse of Dimensionality)&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;차원 축소(Dimension Reduction)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원 축소는 &lt;b&gt;고차원 학습 데이터를 저차원 데이터(3차원 이하)로 변환&lt;/b&gt;하여 학습시간을 절약하고,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;시각적으로 보다 쉽게 데이터의 패턴을 인지할 수 있도록 하는 것을 말한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원 축소 과정에서 &lt;b&gt;정보의 손실을 최소화&lt;/b&gt;하고 &lt;b&gt;데이터의 설명력을 높게 유지&lt;/b&gt;하는 것이 중요하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;대표적인 차원 축소 기법으로 &lt;b&gt;주성분 분석&lt;/b&gt;(PCA, Principal Component Analysis)이 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;규제 회귀 방식(Ridge, Lasso, ElasticNet)도 차원 축소법의 일종이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;회귀 계수가 0에 가까운 컬럼을 삭제하는 Lasso 규제는 보다 직관적인 차원 축소 방식이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;CNN(Convolutional Neural Network)은 &lt;span style=&quot;letter-spacing: 0px;&quot;&gt;인간의 시신경을 모방하여 만든 딥러닝 구조로,&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;letter-spacing: 0px;&quot;&gt;이미지 분석 시 정보 소실을 최소화하기 위해 활용하는 차원 축소 기법이다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;letter-spacing: 0px;&quot;&gt;딥러닝에 사용되는 기법이므로 참고만 하도록 한다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원 축소는 크게 두 가지 방식으로 이루어지는데,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;각각 &lt;b&gt;변수 선택(Feature Selection)&lt;/b&gt;과 &lt;b&gt;변수 추출(Feature Extraction)&lt;/b&gt;이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;주성분 분석(PCA)은 변수 추출 방식을 사용한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;변수 선택(Feature Selection)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변수 선택은 &lt;b&gt;데이터의 특징을 잘 나타내주는 변수(feature)만 선택&lt;/b&gt;하는 방식이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;전처리 실습 과정에서 머신러닝에 필요 없는 feature를 drop한 것도 변수 선택에 해당한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;최근에 등장한 머신러닝 모델(Boosting)의 경우 &lt;b&gt;중요 변수를 추출하는 알고리즘이 내장&lt;/b&gt;되어 있어&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다른 모델들에 비해 우수한 성능을 보인다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다음의 네 가지 변수 선택 방식이 존재한다.&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;전역 탐색(Exhaustive Search)&lt;/li&gt;
&lt;li&gt;전진 선택(Forward Selection)&lt;/li&gt;
&lt;li&gt;후진 제거(Backward Elimination)&lt;/li&gt;
&lt;li&gt;단계 선택(Stepwise Selection)&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* 현실적으로 가능한 방식은 &lt;b&gt;전진 선택, 후진 제거, 단계 선택&lt;/b&gt;의 세 가지뿐이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(1) 전역 탐색(Exhaustive Search)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;가능한 모든 경우를 시도&lt;/b&gt;하여 최적의 feature 조합을 찾는 방식으로,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;할 수만 있다면 최고의 방법이지만 &lt;b&gt;현실적으로 불가능&lt;/b&gt;하다. (변수가 p개일 때 2**p - 1가지 경우의 수 발생)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;599&quot; data-origin-height=&quot;423&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bek56D/btrGIvFwex5/bVcDRpbfdi5NcCrrjAaI21/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bek56D/btrGIvFwex5/bVcDRpbfdi5NcCrrjAaI21/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bek56D/btrGIvFwex5/bVcDRpbfdi5NcCrrjAaI21/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fbek56D%2FbtrGIvFwex5%2FbVcDRpbfdi5NcCrrjAaI21%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;450&quot; height=&quot;318&quot; data-origin-width=&quot;599&quot; data-origin-height=&quot;423&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(2) 전진 선택(Forward Selection)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;설명변수가 하나도 없는 상태에서 시작하여 &lt;b&gt;유의미한 변수부터 차례로 모델에 추가&lt;/b&gt;하는 방식이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;626&quot; data-origin-height=&quot;634&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/ce23ra/btrGE2YizUg/EtLZ6upvamlgmyaP4wRo0k/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/ce23ra/btrGE2YizUg/EtLZ6upvamlgmyaP4wRo0k/img.png&quot; data-alt=&quot;전진 선택법(Forward Selection)&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/ce23ra/btrGE2YizUg/EtLZ6upvamlgmyaP4wRo0k/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fce23ra%2FbtrGE2YizUg%2FEtLZ6upvamlgmyaP4wRo0k%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;350&quot; height=&quot;354&quot; data-origin-width=&quot;626&quot; data-origin-height=&quot;634&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;전진 선택법(Forward Selection)&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;학습 데이터에 x1부터 x5까지 다섯 개의 변수가 존재한다고 할 때,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다음과 같이 변수를 하나씩 추가하면서 각 경우의 결정계수(&lt;span style=&quot;color: #333333;&quot;&gt;R&amp;sup2;)를 비교해 최적의 조합을 찾는다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;span style=&quot;color: #333333;&quot;&gt;3단계에서 다른 변수를 추가해도 모델의 성능이 나아지지 않으므로 &lt;/span&gt;&lt;span style=&quot;color: #333333;&quot;&gt;&lt;b&gt;최적의 조합은 x3, x4&lt;/b&gt;이다.&lt;/span&gt;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;edited_blob&quot; data-origin-width=&quot;618&quot; data-origin-height=&quot;257&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/uAJOp/btrGKwQLuTL/3UtKAS7ySYRAbT7fGQJXY1/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/uAJOp/btrGKwQLuTL/3UtKAS7ySYRAbT7fGQJXY1/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/uAJOp/btrGKwQLuTL/3UtKAS7ySYRAbT7fGQJXY1/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FuAJOp%2FbtrGKwQLuTL%2F3UtKAS7ySYRAbT7fGQJXY1%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;500&quot; height=&quot;208&quot; data-filename=&quot;edited_blob&quot; data-origin-width=&quot;618&quot; data-origin-height=&quot;257&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;edited_blob&quot; data-origin-width=&quot;624&quot; data-origin-height=&quot;257&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/bARv06/btrGGxRqTTa/IshcxX0KDBHi3y7gmCNY6k/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/bARv06/btrGGxRqTTa/IshcxX0KDBHi3y7gmCNY6k/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/bARv06/btrGGxRqTTa/IshcxX0KDBHi3y7gmCNY6k/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FbARv06%2FbtrGGxRqTTa%2FIshcxX0KDBHi3y7gmCNY6k%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;500&quot; height=&quot;206&quot; data-filename=&quot;edited_blob&quot; data-origin-width=&quot;624&quot; data-origin-height=&quot;257&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-filename=&quot;edited_blob&quot; data-origin-width=&quot;587&quot; data-origin-height=&quot;206&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/zsvXR/btrGJFVdP3f/ZqKCQVOB5p3iYxQKM8DhBK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/zsvXR/btrGJFVdP3f/ZqKCQVOB5p3iYxQKM8DhBK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/zsvXR/btrGJFVdP3f/ZqKCQVOB5p3iYxQKM8DhBK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FzsvXR%2FbtrGJFVdP3f%2FZqKCQVOB5p3iYxQKM8DhBK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;480&quot; height=&quot;168&quot; data-filename=&quot;edited_blob&quot; data-origin-width=&quot;587&quot; data-origin-height=&quot;206&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(3) 후진 제거(Backward Elimination)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;설명변수를 모두 포함한 상태에서 시작하여 &lt;b&gt;가장 적은 영향을 주는 변수부터 제거&lt;/b&gt;하는 방식이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;670&quot; data-origin-height=&quot;616&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/d5133y/btrGE2YiFj8/8DkTHOFQKzTgj7xlhi2l5K/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/d5133y/btrGE2YiFj8/8DkTHOFQKzTgj7xlhi2l5K/img.png&quot; data-alt=&quot;후진 제거법(Backward Elimination)&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/d5133y/btrGE2YiFj8/8DkTHOFQKzTgj7xlhi2l5K/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fd5133y%2FbtrGE2YiFj8%2F8DkTHOFQKzTgj7xlhi2l5K%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;350&quot; height=&quot;322&quot; data-origin-width=&quot;670&quot; data-origin-height=&quot;616&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;후진 제거법(Backward Elimination)&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;마찬가지로 x1부터 x5까지 다섯 개의 변수가 존재하는 경우,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이번에는 아래 상태에서 출발하여 추가로 변수를 제거하는 것이 성능 저하를 가져올 때 멈춘다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;722&quot; data-origin-height=&quot;71&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/LpWjY/btrGHmIU7XF/KUzBRKJr8XJcRSG8rf9ZhK/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/LpWjY/btrGHmIU7XF/KUzBRKJr8XJcRSG8rf9ZhK/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/LpWjY/btrGHmIU7XF/KUzBRKJr8XJcRSG8rf9ZhK/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FLpWjY%2FbtrGHmIU7XF%2FKUzBRKJr8XJcRSG8rf9ZhK%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;600&quot; height=&quot;59&quot; data-origin-width=&quot;722&quot; data-origin-height=&quot;71&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(4) 단계 선택(Stepwise Selection)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변수가 하나도 없는 상태에서 시작하여 &lt;b&gt;전진 선택과 후진 제거를 번갈아가며 수행&lt;/b&gt;하는 방식이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이후 과정에서 한 번 &lt;b&gt;선택된 변수가 제거되거나 제거된 변수가 재선택&lt;/b&gt;될 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;전진 선택법과 후진 제거법에 비해 시간이 많이 소요되지만 좋은 성능을 보인다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;862&quot; data-origin-height=&quot;540&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/vHIAe/btrGKxvrEfu/CEYOONzZMa0I6gcVKz4j70/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/vHIAe/btrGKxvrEfu/CEYOONzZMa0I6gcVKz4j70/img.png&quot; data-alt=&quot;4가지 변수 선택법 성능 및 소요시간 비교&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/vHIAe/btrGKxvrEfu/CEYOONzZMa0I6gcVKz4j70/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FvHIAe%2FbtrGKxvrEfu%2FCEYOONzZMa0I6gcVKz4j70%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;500&quot; height=&quot;313&quot; data-origin-width=&quot;862&quot; data-origin-height=&quot;540&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;4가지 변수 선택법 성능 및 소요시간 비교&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;변수 추출(Feature Extraction)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;기존의 데이터를 주요 성분/feature를 포함한 저차원 데이터로 압축하여 추출하는 방식으로,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;원본 데이터의 고차원 feature 공간을 저차원의 &lt;b&gt;새로운 feature 공간&lt;/b&gt;으로 투영한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;기존 feature를 조합하여&amp;nbsp;&lt;b&gt;데이터를 잘 설명할 수 있는 &lt;/b&gt;&lt;b&gt;새로운 feature&lt;/b&gt;를 추출해낸다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;722&quot; data-origin-height=&quot;240&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/ceYXLg/btrGKTeeTKm/spZzmJa65pphaQZQsivZT0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/ceYXLg/btrGKTeeTKm/spZzmJa65pphaQZQsivZT0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/ceYXLg/btrGKTeeTKm/spZzmJa65pphaQZQsivZT0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FceYXLg%2FbtrGKTeeTKm%2FspZzmJa65pphaQZQsivZT0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;450&quot; height=&quot;150&quot; data-origin-width=&quot;722&quot; data-origin-height=&quot;240&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변수 추출 모델은 내장된 알고리즘으로 &lt;b&gt;높은 설명력의&amp;nbsp;상관관계를 가진 feature&lt;/b&gt;들을 찾아,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;&lt;span&gt;선별된 변수를 대상으로&lt;/span&gt;&lt;/b&gt;&lt;span&gt;&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;/span&gt;분산 값을 구하는 등 모델 내부에서 &lt;b&gt;추가적인 계산을 수행&lt;/b&gt;한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이 같은 방식으로 고차원 데이터의 변동을 최대한 보존하면서 저차원 데이터로 변환한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변수 추출 시 내부적인 계산을 통해 중요한 성분을 포함한 새로운 feature를 추출하기 때문에,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;추출된 feature는 &lt;b&gt;기존 데이터의 feature와 다른 값&lt;/b&gt;을 가지게 된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;b&gt;주성분 분석(PCA)&lt;/b&gt;이 대표적인 변수 추출 방식의 차원 축소 기법이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;blockquote data-ke-style=&quot;style2&quot;&gt;주성분 분석(PCA, Principal Component Analysis)&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;주성분 분석은 차원 축소를 위해 &lt;b&gt;분포의 주성분을 분석&lt;/b&gt;하는 기법으로,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변수가 너무 많아 기존 feature를 조합한 새로운 변수로 모델을 생성할 때 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;(* 주성분: 전체 feature의 &lt;b&gt;분산/변동을 가장 잘 설명하는 성분&lt;/b&gt;)&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;주성분 분석의 목적은 &lt;b&gt;데이터를 충분히 잘 설명할 수 있는 새로운 축&lt;/b&gt;을 찾아내는 것이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;예를 들어 x1, x2 두 개의 축을 가진 2차원 데이터를 1차원으로 축소하려고 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;가장 쉬운 방법은 둘 중 하나의 축으로 모든 데이터를 이동시키는 것이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;하지만 그렇게 할 경우, 아래 그림의 x1, x2 축과 같이 &lt;b&gt;겹치는 데이터가 생겨 정보가 소실&lt;/b&gt;된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;615&quot; data-origin-height=&quot;384&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/di5G8v/btrGTwwxAWj/EifeCA2uQZY67Sni7bJSW0/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/di5G8v/btrGTwwxAWj/EifeCA2uQZY67Sni7bJSW0/img.png&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/di5G8v/btrGTwwxAWj/EifeCA2uQZY67Sni7bJSW0/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2Fdi5G8v%2FbtrGTwwxAWj%2FEifeCA2uQZY67Sni7bJSW0%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;500&quot; height=&quot;312&quot; data-origin-width=&quot;615&quot; data-origin-height=&quot;384&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원을 축소하면서 발생하는 정보의 유실을 최소화하기 위해서는 &lt;b&gt;분산을 최대한 보존&lt;/b&gt;해야 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;분산이 커지면 점들 사이의 거리가 유지되어 데이터가 서로 겹치지 않고,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터가 겹치지 않으면 &lt;b&gt;변수들 간의 차이가 명확하여 효율적인 분석&lt;/b&gt;이 가능하다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;다음은 데이터가 퍼져 있는 정도인 분산이 최대가 되는 축을 찾아나가는 과정이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1000&quot; data-origin-height=&quot;400&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/PRguK/btrGRi0i704/LyB8De7HxFsDCv3DXaLkM0/img.gif&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/PRguK/btrGRi0i704/LyB8De7HxFsDCv3DXaLkM0/img.gif&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/PRguK/btrGRi0i704/LyB8De7HxFsDCv3DXaLkM0/img.gif&quot; srcset=&quot;https://blog.kakaocdn.net/dn/PRguK/btrGRi0i704/LyB8De7HxFsDCv3DXaLkM0/img.gif&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;860&quot; height=&quot;344&quot; data-origin-width=&quot;1000&quot; data-origin-height=&quot;400&quot;/&gt;&lt;/span&gt;&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;주성분 분석(PCA)에서는 학습 데이터의 분산이 최대인 축을 먼저 찾고,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이 &lt;b&gt;첫 번째 축과 직교&lt;/b&gt;하면서 &lt;b&gt;분산을 최대한 보존&lt;/b&gt;하는 두 번째 축을 찾는다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p&gt;&lt;figure class=&quot;imageblock alignCenter&quot; data-ke-mobileStyle=&quot;widthOrigin&quot; data-origin-width=&quot;1146&quot; data-origin-height=&quot;515&quot;&gt;&lt;span data-url=&quot;https://blog.kakaocdn.net/dn/snBra/btrGRj5DSmO/ckgcoFkIGJiDySbbcLiL3K/img.png&quot; data-phocus=&quot;https://blog.kakaocdn.net/dn/snBra/btrGRj5DSmO/ckgcoFkIGJiDySbbcLiL3K/img.png&quot; data-alt=&quot;첫 번째 축(빨간색)과 두 번째 축(파란색)&quot;&gt;&lt;img src=&quot;https://blog.kakaocdn.net/dn/snBra/btrGRj5DSmO/ckgcoFkIGJiDySbbcLiL3K/img.png&quot; srcset=&quot;https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&amp;fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FsnBra%2FbtrGRj5DSmO%2FckgcoFkIGJiDySbbcLiL3K%2Fimg.png&quot; onerror=&quot;this.onerror=null; this.src='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png'; this.srcset='//t1.daumcdn.net/tistory_admin/static/images/no-image-v1.png';&quot; loading=&quot;lazy&quot; width=&quot;600&quot; height=&quot;270&quot; data-origin-width=&quot;1146&quot; data-origin-height=&quot;515&quot;/&gt;&lt;/span&gt;&lt;figcaption&gt;첫 번째 축(빨간색)과 두 번째 축(파란색)&lt;/figcaption&gt;
&lt;/figure&gt;
&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;3차원 이상의 데이터라면 처음 두 개의 축에 직교하고 분산이 최대인 세 번째 축을 찾을 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;PCA 알고리즘은 &lt;b&gt;데이터의 차원(feature)의 개수만큼&lt;/b&gt; 여러 방향의 직교하는 축을 찾는다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;이처럼 i번째 축을 정의하는 단위 벡터를 &lt;b&gt;i번째 주성분&lt;/b&gt;(PC, Principal Component)이라고 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;축이 결정되면 모델 내부에서 축을 기준으로 데이터의 값을 변환하는 작업을 거친다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;PCA를 구현하기 위한 API는 sklearn.decomposition에 포함되어 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Argument&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;&lt;span style=&quot;color: #333333;&quot;&gt;n_components&lt;/span&gt; : PCA로 변환할 차원의 수&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Method&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;fit_transform(x) : x에 차원 축소를 적용&lt;/li&gt;
&lt;li&gt;inverse_transform(x) : 데이터를 원래의 차원 공간으로 변환&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;* Attribute(적절한 &lt;b&gt;차원의 수 결정&lt;/b&gt;에 사용)&lt;/p&gt;
&lt;ul style=&quot;list-style-type: disc;&quot; data-ke-list-type=&quot;disc&quot;&gt;
&lt;li&gt;explained_variance_ : component 별 변동성(분산)&lt;/li&gt;
&lt;li&gt;explained_variance_ratio_ : component 별 변동성 비율&lt;/li&gt;
&lt;/ul&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;변동성 비율은 &lt;b&gt;축소된 차원(축)이 원본 데이터의 변동성을 얼마나 반영&lt;/b&gt;하는지 나타내는 비율이다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;각 component가 전체 데이터에 대해 어느 정도의 설명력을 가지는지 알 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원 축소 시 적절한 &lt;b&gt;차원의 수를 결정&lt;/b&gt;하는 데에 사용된다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;차원 축소와 주성분 분석을 완전히 이해하기 위해서는,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;공분산 행렬과 고유값(eigenvalue) 등 &lt;b&gt;수학적 지식이 뒷받침&lt;/b&gt;되어야 한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터 사이언스와 머신러닝을 깊이 있게 이해하기 위한 &lt;b&gt;수학 공부의 필요성&lt;/b&gt;을 느낀다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;아래 링크에서 주성분 분석(PCA) 실습 코드를 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-19-pca.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-19-pca.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1657538468402&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/data-analytics&quot; data-og-description=&quot;Contribute to tldnjs1231/data-analytics development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-19-pca.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/data-analytics&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/bw1xYg/hyO3mF2qr4/iAUMGP854OVP9IUUcEuKak/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-19-pca.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-19-pca.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/bw1xYg/hyO3mF2qr4/iAUMGP854OVP9IUUcEuKak/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/data-analytics&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/data-analytics development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/빅데이터 분석과 모델링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/58</guid>
      <comments>https://cool-kim.tistory.com/58#entry58comment</comments>
      <pubDate>Tue, 5 Jul 2022 22:13:17 +0900</pubDate>
    </item>
    <item>
      <title>18. Kaggle 프로젝트: 호텔 예약</title>
      <link>https://cool-kim.tistory.com/57</link>
      <description>&lt;blockquote data-ke-size=&quot;size16&quot; data-ke-style=&quot;style1&quot;&gt;호텔 예약 취소 여부(binary data)를 예측하는 모델링&lt;/blockquote&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;머신러닝 플랫폼&lt;span&gt;&amp;nbsp;&lt;/span&gt;&lt;b&gt;캐글(Kaggle)&lt;/b&gt;의 호텔 예약 관련 데이터를 활용하여,&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;데이터 전처리 및 시각화와 로지스틱 회귀, 랜덤 포레스트, XGBoost 등의 모델을 실습한다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;프로젝트 실습 코드는 아래 링크에서 확인할 수 있다.&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-18-hotel_booking.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot;&gt;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-18-hotel_booking.ipynb&lt;/a&gt;&lt;/p&gt;
&lt;figure id=&quot;og_1667813526029&quot; contenteditable=&quot;false&quot; data-ke-type=&quot;opengraph&quot; data-ke-align=&quot;alignCenter&quot; data-og-type=&quot;object&quot; data-og-title=&quot;GitHub - tldnjs1231/data-analytics&quot; data-og-description=&quot;Contribute to tldnjs1231/data-analytics development by creating an account on GitHub.&quot; data-og-host=&quot;github.com&quot; data-og-source-url=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-18-hotel_booking.ipynb&quot; data-og-url=&quot;https://github.com/tldnjs1231/data-analytics&quot; data-og-image=&quot;https://scrap.kakaocdn.net/dn/pUBka/hyQtva8OKw/F6hsGXI5Spkk1r66j3SNu0/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600&quot;&gt;&lt;a href=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-18-hotel_booking.ipynb&quot; target=&quot;_blank&quot; rel=&quot;noopener&quot; data-source-url=&quot;https://github.com/tldnjs1231/data-analytics/blob/main/data-analytics-18-hotel_booking.ipynb&quot;&gt;
&lt;div class=&quot;og-image&quot; style=&quot;background-image: url('https://scrap.kakaocdn.net/dn/pUBka/hyQtva8OKw/F6hsGXI5Spkk1r66j3SNu0/img.png?width=1200&amp;amp;height=600&amp;amp;face=0_0_1200_600');&quot;&gt;&amp;nbsp;&lt;/div&gt;
&lt;div class=&quot;og-text&quot;&gt;
&lt;p class=&quot;og-title&quot; data-ke-size=&quot;size16&quot;&gt;GitHub - tldnjs1231/data-analytics&lt;/p&gt;
&lt;p class=&quot;og-desc&quot; data-ke-size=&quot;size16&quot;&gt;Contribute to tldnjs1231/data-analytics development by creating an account on GitHub.&lt;/p&gt;
&lt;p class=&quot;og-host&quot; data-ke-size=&quot;size16&quot;&gt;github.com&lt;/p&gt;
&lt;/div&gt;
&lt;/a&gt;&lt;/figure&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;
&lt;p data-ke-size=&quot;size16&quot;&gt;&amp;nbsp;&lt;/p&gt;</description>
      <category>데이터 사이언스/빅데이터 분석과 모델링</category>
      <author>gool</author>
      <guid isPermaLink="true">https://cool-kim.tistory.com/57</guid>
      <comments>https://cool-kim.tistory.com/57#entry57comment</comments>
      <pubDate>Tue, 5 Jul 2022 22:12:27 +0900</pubDate>
    </item>
  </channel>
</rss>