diff --git a/.gitignore b/.gitignore index 9334f368..19c937b7 100644 --- a/.gitignore +++ b/.gitignore @@ -164,6 +164,7 @@ openai_api/data/fine_food_reviews_with_embeddings_1k_2146.csv # files of Vector Stores, e.g. Chroma *.pkl *.bin +*.faiss langchain/openai-translator/flagged/* langchain/openai-translator/flask_temps/* \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/main.py b/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/main.py new file mode 100644 index 00000000..c2603766 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/main.py @@ -0,0 +1,33 @@ +import gradio as gr +from run_autogpt import LCAutoGPTRunner + +def createAutoGPTUI(): + + with gr.Blocks() as demo: + #create a text box for user input , label is "Your task:" + #create a button to run the model, label is "Run" + #create a output box to display the output and download result file, label is "Output" + gr.Markdown( + """ + # Hello AutoGPT! + ### Type your task in the box below and click "Run" to run the model. + """) + with gr.Row() as row: + task = gr.Textbox(lines=3, label="Your task:", min_width=300) + run = gr.Button(label="Run") + output = gr.File(label="Output") + + def run_autoGPT(task): + #run the model and return the result + file = LCAutoGPTRunner().run(task) + + return file + + run.click(run_autoGPT, inputs=[task], outputs=[output]) + + demo.launch() + + + +if __name__ == "__main__": + createAutoGPTUI() \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/requirements.txt b/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/requirements.txt new file mode 100644 index 00000000..8af494a0 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/requirements.txt @@ -0,0 +1,5 @@ +langchain +langchain_experimental +openai +gradio +google-search-results \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/run_autogpt.py b/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/run_autogpt.py new file mode 100644 index 00000000..db24dade --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/AutoGPTWebUI/run_autogpt.py @@ -0,0 +1,58 @@ +import os +import uuid +import faiss + +from langchain.utilities import SerpAPIWrapper +from langchain.agents import Tool +from langchain.embeddings import OpenAIEmbeddings +from langchain.tools.file_management.write import WriteFileTool +from langchain.tools.file_management.read import ReadFileTool +from langchain.vectorstores import FAISS +from langchain.docstore import InMemoryDocstore +from langchain_experimental.autonomous_agents import AutoGPT +from langchain.chat_models import ChatOpenAI + +os.environ["SERPAPI_API_KEY"] = "25409e23ca5020b5264e40a5e3594f50a7ef8ee912a35f86e7ef0fde598dbd7c" + +class LCAutoGPTRunner: + agent = None + + def __init__(self): + # 构造 AutoGPT 的工具集 + search = SerpAPIWrapper() + tools = [ + Tool( + name="search", + func=search.run, + description="useful for when you need to answer questions about current events. 
You should ask targeted questions", + ), + WriteFileTool(), + ReadFileTool(), + ] + + embeddings_model = OpenAIEmbeddings() + + # OpenAI Embedding 向量维数 + embedding_size = 1536 + # 使用 Faiss 的 IndexFlatL2 索引 + index = faiss.IndexFlatL2(embedding_size) + # 实例化 Faiss 向量数据库 + vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {}) + + agent = AutoGPT.from_llm_and_tools( + ai_name="Jarvis", + ai_role="Assistant", + tools=tools, + llm=ChatOpenAI(temperature=0), + memory=vectorstore.as_retriever(), # 实例化 Faiss 的 VectorStoreRetriever + ) + agent.chain.verbose = True + self.agent = agent + + def run(self, task): + #run the model and return the result + rand = uuid.uuid4() + tasks = [f'{task} Then write file to "result-{rand}.txt"'] + result = self.agent.run(tasks) + print(f'AutoGPT result: {result}') + return f"result-{rand}.txt" \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/LICENSE b/langchain/jupyter/homework/panxizhi/openai-translator/LICENSE new file mode 100644 index 00000000..f288702d --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. 
For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
+ + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/README-CN.md b/langchain/jupyter/homework/panxizhi/openai-translator/README-CN.md new file mode 100644 index 00000000..b9fa6971 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/README-CN.md @@ -0,0 +1,93 @@ +# OpenAI-Translator + +
+English | 中文
+
+所有的代码和文档完全由 OpenAI 的 GPT-4 模型生成
+ +## 介绍 + +OpenAI 翻译器是一个使用 AI 技术将英文 PDF 书籍翻译成中文的工具。这个工具使用了大型语言模型 (LLMs),如 ChatGLM 和 OpenAI 的 GPT-3 以及 GPT-3.5 Turbo 来进行翻译。它是用 Python 构建的,并且具有灵活、模块化和面向对象的设计。 + +## 为什么做这个项目 + +在现今的环境中,缺乏非商业而且有效的 PDF 翻译工具。很多用户有包含敏感数据的 PDF 文件,他们更倾向于不将其上传到公共商业服务网站,以保护隐私。这个项目就是为了解决这个问题,为需要翻译他们的 PDF 文件同时又要保护数据隐私的用户提供解决方案。 + +## 示例结果 + +OpenAI 翻译器目前还处于早期开发阶段,我正在积极地添加更多功能和改进其性能。我们非常欢迎任何反馈或贡献! + +![The_Old_Man_of_the_Sea](images/sample_image_0.png) + +
+"老人与海"
+ +## 特性 + +- [X] 使用大型语言模型 (LLMs) 将英文 PDF 书籍翻译成中文。 +- [X] 支持 ChatGLM 和 OpenAI 模型。 +- [X] 通过 YAML 文件或命令行参数灵活配置。 +- [X] 对健壮的翻译操作进行超时和错误处理。 +- [X] 模块化和面向对象的设计,易于定制和扩展。 +- [x] 添加对其他语言和翻译方向的支持。 +- [ ] 实现图形用户界面 (GUI) 以便更易于使用。 +- [ ] 创建一个网络服务或 API,以便在网络应用中使用。 +- [ ] 添加对多个 PDF 文件的批处理支持。 +- [ ] 添加对保留源 PDF 的原始布局和格式的支持。 +- [ ] 通过使用自定义训练的翻译模型来提高翻译质量。 + + +## 开始使用 + +### 环境准备 + +1.克隆仓库 `git clone git@github.com:DjangoPeng/openai-translator.git`。 + +2.OpenAI-翻译器 需要 Python 3.10 或更高版本。使用 `pip install -r requirements.txt` 安装依赖项。 + +3.设置您的 OpenAI API 密钥(`$OPENAI_API_KEY`)。您可以将其添加到环境变量中,或者在 config.yaml 文件中指定。 + +### 使用示例 + +您可以通过指定配置文件或提供命令行参数来使用 OpenAI-Translator 工具。 + +#### 使用配置文件 + +根据您的设置调整 `config.yaml` 文件: + +```yaml +model_name: "gpt-3.5-turbo" +input_file: "tests/test.pdf" +output_file_format: "markdown" +source_language: "English" +target_language: "Chinese" +``` + +然后命令行直接运行: + +```bash +python ai_translator/main.py +``` + +![sample_out](images/sample_image_1.png) + +#### 使用命令行参数 + +您也可以直接在命令行上指定设置。这是使用 OpenAI 模型的例子: + +```bash +# 将您的 api_key 设置为环境变量 +export OPENAI_API_KEY="sk-xxx" +python ai_translator/main.py --model_name "gpt-3.5-turbo" --input_file "your_input.pdf" --output_file_format "markdown" --source_language "English" --target_language "Chinese" +``` + +## 许可证 + +该项目采用 GPL-3.0 许可证。有关详细信息,请查看 [LICENSE](LICENSE) 文件。 + + + + diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/README.md b/langchain/jupyter/homework/panxizhi/openai-translator/README.md new file mode 100644 index 00000000..db73ec4d --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/README.md @@ -0,0 +1,90 @@ +# OpenAI-Translator + +
+English | 中文
+
+All the code and documentation are entirely generated by OpenAI's GPT-4 Model
+ + +## Introduction + +OpenAI Translator is an AI-powered translation tool designed to translate English PDF books to Chinese. The tool leverages large language models (LLMs) like ChatGLM and OpenAI's GPT-3 and GPT-3.5 Turbo for translation. It's built in Python and has a flexible, modular, and object-oriented design. + +## Why this project + +In the current landscape, there's a lack of non-commercial yet efficient PDF translation tools. Many users have PDF documents with sensitive data that they prefer not to upload to public commercial service websites due to privacy concerns. This project was developed to address this gap, providing a solution for users who need to translate their PDFs while maintaining data privacy. + +### Sample Results + +The OpenAI Translator is still in its early stages of development, and I'm actively working on adding more features and improving its performance. We appreciate any feedback or contributions! + +![The_Old_Man_of_the_Sea](images/sample_image_0.png) + +
+"The Old Man and the Sea"
+ +## Features + +- [X] Translation of English PDF books to Chinese using LLMs. +- [X] Support for both [ChatGLM](https://github.com/THUDM/ChatGLM-6B) and [OpenAI](https://platform.openai.com/docs/models) models. +- [X] Flexible configuration through a YAML file or command-line arguments. +- [X] Timeouts and error handling for robust translation operations. +- [X] Modular and object-oriented design for easy customization and extension. +- [x] Add support for other languages and translation directions. +- [ ] Implement a graphical user interface (GUI) for easier use. +- [ ] Create a web service or API to enable usage in web applications. +- [ ] Add support for batch processing of multiple PDF files. +- [ ] Add support for preserving the original layout and formatting of the source PDF. +- [ ] Improve translation quality by using custom-trained translation models. + + +## Getting Started + +### Environment Setup + +1.Clone the repository `git clone git@github.com:DjangoPeng/openai-translator.git`. + +2.The `OpenAI-Translator` requires Python 3.10 or later. Install the dependencies with `pip install -r requirements.txt`. + +3.Set up your OpenAI API key(`$OPENAI_API_KEY`). You can either add it to your environment variables or specify it in the config.yaml file. + +### Usage + +You can use OpenAI-Translator either by specifying a configuration file or by providing command-line arguments. + +#### Using a configuration file: + +Adapt `config.yaml` file with your settings: + +```yaml +model_name: "gpt-3.5-turbo" +input_file: "tests/test.pdf" +output_file_format: "markdown" +source_language: "English" +target_language: "Chinese" +``` + +Then run the tool: + +```bash +python ai_translator/main.py +``` + +![sample_out](images/sample_image_1.png) + +#### Using command-line arguments: + +You can also specify the settings directly on the command line. Here's an example of how to use the OpenAI model: + +```bash +# Set your api_key as an env variable +export OPENAI_API_KEY="sk-xxx" +python ai_translator/main.py --model_name "gpt-3.5-turbo" --input_file "your_input.pdf" --output_file_format "markdown" --source_language "English" --target_language "Chinese" +``` + +## License + +This project is licensed under the GPL-3.0 License. See the [LICENSE](LICENSE) file for details. 
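+### Web UI and HTTP API
+
+A Gradio front end (`ai_translator/gradio_server.py`) and a Flask endpoint (`ai_translator/flask_server.py`) are also included. The commands below are a minimal, untested sketch of calling the Flask endpoint; they assume the server picks up `model_name` from `config.yaml` (as `main.py` does), listens on the default port 5000, and is started from the project root so the relative `flask_temps/` upload directory resolves.
+
+```bash
+# OPENAI_API_KEY must be set, as described in Environment Setup.
+# Gradio UI alternative: python ai_translator/gradio_server.py
+mkdir -p flask_temps            # temporary upload directory used by flask_server.py
+python ai_translator/flask_server.py
+
+# In another shell: POST a PDF to /translation. The translated file is returned
+# as an attachment (markdown by default, per PDFTranslator.translate_pdf).
+curl -X POST http://localhost:5000/translation \
+  -F "input_file=@tests/test.pdf" \
+  -F "source_language=English" \
+  -F "target_language=Chinese" \
+  -o translated.md
+```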
\ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/__init__.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/__init__.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/__init__.py new file mode 100644 index 00000000..5b688799 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/__init__.py @@ -0,0 +1,3 @@ +from .book import Book +from .page import Page +from .content import ContentType, Content, TableContent \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/book.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/book.py new file mode 100644 index 00000000..b079357b --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/book.py @@ -0,0 +1,9 @@ +from .page import Page + +class Book: + def __init__(self, pdf_file_path): + self.pdf_file_path = pdf_file_path + self.pages = [] + + def add_page(self, page: Page): + self.pages.append(page) \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/content.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/content.py new file mode 100644 index 00000000..901c2a07 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/content.py @@ -0,0 +1,85 @@ +import pandas as pd + +from enum import Enum, auto +from PIL import Image as PILImage +from utils import LOG +from io import StringIO + +class ContentType(Enum): + TEXT = auto() + TABLE = auto() + IMAGE = auto() + +class Content: + def __init__(self, content_type, original, translation=None): + self.content_type = content_type + self.original = original + self.translation = translation + self.status = False + + def set_translation(self, translation, status): + if not self.check_translation_type(translation): + raise ValueError(f"Invalid translation type. Expected {self.content_type}, but got {type(translation)}") + self.translation = translation + self.status = status + + def check_translation_type(self, translation): + if self.content_type == ContentType.TEXT and isinstance(translation, str): + return True + elif self.content_type == ContentType.TABLE and isinstance(translation, list): + return True + elif self.content_type == ContentType.IMAGE and isinstance(translation, PILImage.Image): + return True + return False + + def __str__(self): + return self.original + + +class TableContent(Content): + def __init__(self, data, translation=None): + df = pd.DataFrame(data) + + # Verify if the number of rows and columns in the data and DataFrame object match + if len(data) != len(df) or len(data[0]) != len(df.columns): + raise ValueError("The number of rows and columns in the extracted table data and DataFrame object do not match.") + + super().__init__(ContentType.TABLE, df) + + def set_translation(self, translation, status): + try: + if not isinstance(translation, str): + raise ValueError(f"Invalid translation type. 
Expected str, but got {type(translation)}") + + LOG.debug(f"[translation]\n{translation}") + # Extract column names from the first set of brackets + header = translation.split(']')[0][1:].split(', ') + # Extract data rows from the remaining brackets + data_rows = translation.split('] ')[1:] + # Replace Chinese punctuation and split each row into a list of values + data_rows = [row[1:-1].split(', ') for row in data_rows] + # Create a DataFrame using the extracted header and data + translated_df = pd.DataFrame(data_rows, columns=header) + LOG.debug(f"[translated_df]\n{translated_df}") + self.translation = translated_df + self.status = status + except Exception as e: + LOG.error(f"An error occurred during table translation: {e}") + self.translation = None + self.status = False + + def __str__(self): + return self.original.to_string(header=False, index=False) + + def iter_items(self, translated=False): + target_df = self.translation if translated else self.original + for row_idx, row in target_df.iterrows(): + for col_idx, item in enumerate(row): + yield (row_idx, col_idx, item) + + def update_item(self, row_idx, col_idx, new_value, translated=False): + target_df = self.translation if translated else self.original + target_df.at[row_idx, col_idx] = new_value + + def get_original_as_str(self): + return self.original.to_string(header=False, index=False) \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/page.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/page.py new file mode 100644 index 00000000..df12e772 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/book/page.py @@ -0,0 +1,8 @@ +from .content import Content + +class Page: + def __init__(self): + self.contents = [] + + def add_content(self, content: Content): + self.contents.append(content) diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/flask_server.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/flask_server.py new file mode 100644 index 00000000..7b5bed03 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/flask_server.py @@ -0,0 +1,71 @@ +import sys +import os + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from flask import Flask, request, send_file, jsonify +from translator import PDFTranslator, TranslationConfig +from utils import ArgumentParser, LOG + +app = Flask(__name__) + +TEMP_FILE_DIR = "flask_temps/" + +@app.route('/translation', methods=['POST']) +def translation(): + try: + input_file = request.files['input_file'] + source_language = request.form.get('source_language', 'English') + target_language = request.form.get('target_language', 'Chinese') + + LOG.debug(f"[input_file]\n{input_file}") + LOG.debug(f"[input_file.filename]\n{input_file.filename}") + + if input_file and input_file.filename: + # # 创建临时文件 + input_file_path = TEMP_FILE_DIR+input_file.filename + LOG.debug(f"[input_file_path]\n{input_file_path}") + + input_file.save(input_file_path) + + # 调用翻译函数 + output_file_path = Translator.translate_pdf( + input_file=input_file_path, + source_language=source_language, + target_language=target_language) + + # 移除临时文件 + # os.remove(input_file_path) + + # 构造完整的文件路径 + output_file_path = os.getcwd() + "/" + output_file_path + LOG.debug(output_file_path) + + # 返回翻译后的文件 + return send_file(output_file_path, as_attachment=True) + except Exception as e: + response = { + 'status': 'error', + 
'message': str(e) + } + return jsonify(response), 400 + + +def initialize_translator(): + # 解析命令行 + argument_parser = ArgumentParser() + args = argument_parser.parse_arguments() + + # 初始化配置单例 + config = TranslationConfig() + config.initialize(args) + # 实例化 PDFTranslator 类,并调用 translate_pdf() 方法 + global Translator + Translator = PDFTranslator(config.model_name) + + +if __name__ == "__main__": + # 初始化 translator + initialize_translator() + # 启动 Flask Web Server + app.run(host="0.0.0.0", port=5000, debug=True) \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/gradio_server.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/gradio_server.py new file mode 100644 index 00000000..67d4e846 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/gradio_server.py @@ -0,0 +1,80 @@ +import sys +import os +import gradio as gr + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from utils import ArgumentParser, LOG +from translator import PDFTranslator, TranslationConfig + +styleMap = { + "小说": "Novel", + "新闻稿": "News", + "李白风格": "LiBai-Style", +} + +last_model_name = "" +translators = {} + +def translation(input_file, style_name, fiel_format, model_name, source_language, target_language): + if input_file is None or model_name is None or style_name is None or fiel_format is None or source_language is None or target_language is None: + LOG.debug("未上传文件") + return None + + LOG.debug(f"[翻译任务]\n风格:{style_name}\n输出格式:{fiel_format}\n源文件: {input_file.name}\n源语言: {source_language}\n目标语言: {target_language}") + + global Translator, last_model_name , translators + if last_model_name != model_name: + last_model_name = model_name + if translators.get(model_name) is None: + translators[model_name] = PDFTranslator(model_name) + Translator = translators[model_name] + else: + Translator = translators[model_name] + + style_id = styleMap[style_name] + output_file_path = Translator.translate_pdf( + input_file.name,translate_style=style_id, output_file_format=fiel_format, source_language=source_language, target_language=target_language) + + return output_file_path + +def launch_gradio(): + + iface = gr.Interface( + fn=translation, + title="OpenAI-Translator v2.0(PDF 电子书翻译工具)", + inputs=[ + gr.File(label="上传PDF文件"), + gr.Dropdown(label="翻译风格", choices=["小说", "新闻稿","李白风格"], value="小说"), + gr.Dropdown(label="输出格式", choices=["PDF","Markdown"], value="PDF"), + gr.Dropdown(label="模型", choices=["gpt-3.5-turbo", "chatglm6b"], value="gpt-3.5-turbo"), + gr.Textbox(label="源语言(默认:英文)", placeholder="English", value="English"), + gr.Textbox(label="目标语言(默认:中文)", placeholder="Chinese", value="Chinese") + ], + outputs=[ + gr.File(label="下载翻译文件") + ], + allow_flagging="never", + live=True + ) + + iface.launch(server_name="0.0.0.0") + +def initialize_translator(): + # 解析命令行 + argument_parser = ArgumentParser() + args = argument_parser.parse_arguments() + + # 初始化配置单例 + config = TranslationConfig() + config.initialize(args) + # 实例化 PDFTranslator 类,并调用 translate_pdf() 方法 + global Translator + Translator = PDFTranslator(config.model_name) + + +if __name__ == "__main__": + # 初始化 translator + initialize_translator() + # 启动 Gradio 服务 + launch_gradio() diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/main.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/main.py new file mode 100644 index 00000000..33ae7282 --- /dev/null +++ 
b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/main.py @@ -0,0 +1,20 @@ +import sys +import os + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from utils import ArgumentParser, LOG +from translator import PDFTranslator, TranslationConfig + +if __name__ == "__main__": + # 解析命令行 + argument_parser = ArgumentParser() + args = argument_parser.parse_arguments() + + # 初始化配置单例 + config = TranslationConfig() + config.initialize(args) + + # 实例化 PDFTranslator 类,并调用 translate_pdf() 方法 + translator = PDFTranslator(config.model_name) + translator.translate_pdf(config.input_file, config.output_file_format, pages=None) diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/__init__.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/__init__.py new file mode 100644 index 00000000..0e3fdcca --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/__init__.py @@ -0,0 +1,2 @@ +from .pdf_translator import PDFTranslator +from .translation_config import TranslationConfig \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/exceptions.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/exceptions.py new file mode 100644 index 00000000..4f4c23c1 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/exceptions.py @@ -0,0 +1,5 @@ +class PageOutOfRangeException(Exception): + def __init__(self, book_pages, requested_pages): + self.book_pages = book_pages + self.requested_pages = requested_pages + super().__init__(f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested.") diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/pdf_parser.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/pdf_parser.py new file mode 100644 index 00000000..6f2f9bc3 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/pdf_parser.py @@ -0,0 +1,58 @@ +import pdfplumber +from typing import Optional +from book import Book, Page, Content, ContentType, TableContent +from translator.exceptions import PageOutOfRangeException +from utils import LOG + + +class PDFParser: + def __init__(self): + pass + + def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book: + book = Book(pdf_file_path) + + with pdfplumber.open(pdf_file_path) as pdf: + if pages is not None and pages > len(pdf.pages): + raise PageOutOfRangeException(len(pdf.pages), pages) + + if pages is None: + pages_to_parse = pdf.pages + else: + pages_to_parse = pdf.pages[:pages] + + for pdf_page in pages_to_parse: + page = Page() + + # Store the original text content + raw_text = pdf_page.extract_text() + tables = pdf_page.extract_tables() + + # Remove each cell's content from the original text + for table_data in tables: + for row in table_data: + for cell in row: + raw_text = raw_text.replace(cell, "", 1) + + # Handling text + if raw_text: + # Remove empty lines and leading/trailing whitespaces + raw_text_lines = raw_text.splitlines() + cleaned_raw_text_lines = [line.strip() for line in raw_text_lines if line.strip()] + cleaned_raw_text = "\n".join(cleaned_raw_text_lines) + + text_content = Content(content_type=ContentType.TEXT, original=cleaned_raw_text) + page.add_content(text_content) + LOG.debug(f"[raw_text]\n 
{cleaned_raw_text}") + + + + # Handling tables + if tables: + table = TableContent(tables) + page.add_content(table) + LOG.debug(f"[table]\n{table}") + + book.add_page(page) + + return book diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/pdf_translator.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/pdf_translator.py new file mode 100644 index 00000000..a4fe9f12 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/pdf_translator.py @@ -0,0 +1,30 @@ +from typing import Optional +from translator.pdf_parser import PDFParser +from translator.writer import Writer +from translator.translation_chain import TranslationChain +from utils import LOG + +class PDFTranslator: + def __init__(self, model_name: str): + self.translate_chain = TranslationChain(model_name) + self.pdf_parser = PDFParser() + self.writer = Writer() + + def translate_pdf(self, + input_file: str, + translate_style: str = None, + output_file_format: str = 'markdown', + source_language: str = "English", + target_language: str = 'Chinese', + pages: Optional[int] = None): + + self.book = self.pdf_parser.parse_pdf(input_file, pages) + + for page_idx, page in enumerate(self.book.pages): + for content_idx, content in enumerate(page.contents): + # Translate content.original + translation, status = self.translate_chain.run(content, source_language, target_language, translation_style=translate_style) + # Update the content in self.book.pages directly + self.book.pages[page_idx].contents[content_idx].set_translation(translation, status) + + return self.writer.save_translated_book(self.book, output_file_format) diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/translation_chain.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/translation_chain.py new file mode 100644 index 00000000..f3886ec4 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/translation_chain.py @@ -0,0 +1,62 @@ +from langchain.chat_models import ChatOpenAI +from langchain.llms import ChatGLM +from langchain.chains import LLMChain + +from langchain.prompts.chat import ( + ChatPromptTemplate, + SystemMessagePromptTemplate, + HumanMessagePromptTemplate, +) + +from utils import LOG + +def createLLM(model_name: str = "gpt-3.5-turbo", verbose: bool = True): + if model_name == 'chatglm6b' : + chat = ChatGLM(endpoint_url='http://127.0.0.1:8000', + max_token=80000, + history=[], + top_p=0.9, + model_kwargs={"sample_model_args": False }, verbose=verbose) + else: + chat = ChatOpenAI(model_name=model_name, temperature=0, verbose=verbose) + + return chat + +class TranslationChain: + def __init__(self, model_name: str = "gpt-3.5-turbo", verbose: bool = True): + + # 翻译任务指令始终由 System 角色承担 + template = ( + """You are a translation expert, proficient in various languages. 
You're good at language style of {style_name}.\n + Translates {source_language} to {target_language}.""" + ) + system_message_prompt = SystemMessagePromptTemplate.from_template(template) + + # 待翻译文本由 Human 角色输入 + human_template = "{text}" + human_message_prompt = HumanMessagePromptTemplate.from_template(human_template) + + # 使用 System 和 Human 角色的提示模板构造 ChatPromptTemplate + chat_prompt_template = ChatPromptTemplate.from_messages( + [system_message_prompt, human_message_prompt] + ) + + # 为了翻译结果的稳定性,将 temperature 设置为 0 + chat = createLLM(model_name, verbose) + + self.chain = LLMChain(llm=chat, prompt=chat_prompt_template, verbose=verbose) + + def run(self, text: str, source_language: str, target_language: str, translation_style: str = "CNN Writer") -> (str, bool): + result = "" + try: + result = self.chain.run({ + "text": text, + "style_name": translation_style, + "source_language": source_language, + "target_language": target_language, + }) + except Exception as e: + LOG.error(f"An error occurred during translation: {e}") + return result, False + + return result, True \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/translation_config.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/translation_config.py new file mode 100644 index 00000000..783823ae --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/translation_config.py @@ -0,0 +1,29 @@ +import yaml + +class TranslationConfig: + _instance = None + + def __new__(cls): + if cls._instance is None: + cls._instance = super(TranslationConfig, cls).__new__(cls) + cls._instance._config = None + return cls._instance + + def initialize(self, args): + with open(args.config_file, "r") as f: + config = yaml.safe_load(f) + + # Use the argparse Namespace to update the configuration + overridden_values = { + key: value for key, value in vars(args).items() if key in config and value is not None + } + config.update(overridden_values) + + # Store the original config dictionary + self._instance._config = config + + def __getattr__(self, name): + # Try to get attribute from _config + if self._instance._config and name in self._instance._config: + return self._instance._config[name] + raise AttributeError(f"'TranslationConfig' object has no attribute '{name}'") \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/writer.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/writer.py new file mode 100644 index 00000000..90b51ed5 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/translator/writer.py @@ -0,0 +1,114 @@ +import os +from reportlab.lib import colors, pagesizes, units +from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle +from reportlab.pdfbase import pdfmetrics +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.platypus import ( + SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak +) + +from book import Book, ContentType +from utils import LOG + +class Writer: + def __init__(self): + pass + + def save_translated_book(self, book: Book, ouput_file_format: str): + LOG.debug(ouput_file_format) + + if ouput_file_format.lower() == "pdf": + output_file_path = self._save_translated_book_pdf(book) + elif ouput_file_format.lower() == "markdown": + output_file_path = self._save_translated_book_markdown(book) + else: + 
LOG.error(f"不支持文件类型: {ouput_file_format}") + return "" + + LOG.info(f"翻译完成,文件保存至: {output_file_path}") + + return output_file_path + + + def _save_translated_book_pdf(self, book: Book, output_file_path: str = None): + + output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.pdf') + + LOG.info(f"开始导出: {output_file_path}") + + # Register Chinese font + font_path = "../fonts/simsun.ttc" # 请将此路径替换为您的字体文件路径 + pdfmetrics.registerFont(TTFont("SimSun", font_path)) + + # Create a new ParagraphStyle with the SimSun font + simsun_style = ParagraphStyle('SimSun', fontName='SimSun', fontSize=12, leading=14) + + # Create a PDF document + doc = SimpleDocTemplate(output_file_path, pagesize=pagesizes.letter) + styles = getSampleStyleSheet() + story = [] + + # Iterate over the pages and contents + for page in book.pages: + for content in page.contents: + if content.status: + if content.content_type == ContentType.TEXT: + # Add translated text to the PDF + text = content.translation + para = Paragraph(text, simsun_style) + story.append(para) + + elif content.content_type == ContentType.TABLE: + # Add table to the PDF + table = content.translation + table_style = TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'SimSun'), # 更改表头字体为 "SimSun" + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('FONTNAME', (0, 1), (-1, -1), 'SimSun'), # 更改表格中的字体为 "SimSun" + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ]) + pdf_table = Table(table.values.tolist()) + pdf_table.setStyle(table_style) + story.append(pdf_table) + # Add a page break after each page except the last one + if page != book.pages[-1]: + story.append(PageBreak()) + + # Save the translated book as a new PDF file + doc.build(story) + return output_file_path + + + def _save_translated_book_markdown(self, book: Book, output_file_path: str = None): + output_file_path = book.pdf_file_path.replace('.pdf', f'_translated.md') + + LOG.info(f"开始导出: {output_file_path}") + with open(output_file_path, 'w', encoding='utf-8') as output_file: + # Iterate over the pages and contents + for page in book.pages: + for content in page.contents: + if content.status: + if content.content_type == ContentType.TEXT: + # Add translated text to the Markdown file + text = content.translation + output_file.write(text + '\n\n') + + elif content.content_type == ContentType.TABLE: + # Add table to the Markdown file + table = content.translation + header = '| ' + ' | '.join(str(column) for column in table.columns) + ' |' + '\n' + separator = '| ' + ' | '.join(['---'] * len(table.columns)) + ' |' + '\n' + # body = '\n'.join(['| ' + ' | '.join(row) + ' |' for row in table.values.tolist()]) + '\n\n' + body = '\n'.join(['| ' + ' | '.join(str(cell) for cell in row) + ' |' for row in table.values.tolist()]) + '\n\n' + output_file.write(header + separator + body) + + # Add a page break (horizontal rule) after each page except the last one + if page != book.pages[-1]: + output_file.write('---\n\n') + + return output_file_path \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/__init__.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/__init__.py new file mode 100644 index 00000000..09b16931 --- /dev/null +++ 
b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/__init__.py @@ -0,0 +1,2 @@ +from .argument_parser import ArgumentParser +from .logger import LOG \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/argument_parser.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/argument_parser.py new file mode 100644 index 00000000..57684d86 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/argument_parser.py @@ -0,0 +1,15 @@ +import argparse + +class ArgumentParser: + def __init__(self): + self.parser = argparse.ArgumentParser(description='A translation tool that supports translations in any language pair.') + self.parser.add_argument('--config_file', type=str, default='config.yaml', help='Configuration file with model and API settings.') + self.parser.add_argument('--model_name', type=str, help='Name of the Large Language Model.') + self.parser.add_argument('--input_file', type=str, help='PDF file to translate.') + self.parser.add_argument('--output_file_format', type=str, help='The file format of translated book. Now supporting PDF and Markdown') + self.parser.add_argument('--source_language', type=str, help='The language of the original book to be translated.') + self.parser.add_argument('--target_language', type=str, help='The target language for translating the original book.') + + def parse_arguments(self): + args = self.parser.parse_args() + return args diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/logger.py b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/logger.py new file mode 100644 index 00000000..a252b50e --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/ai_translator/utils/logger.py @@ -0,0 +1,32 @@ +from loguru import logger +import os +import sys + +LOG_FILE = "translation.log" +ROTATION_TIME = "02:00" + +class Logger: + def __init__(self, name="translation", log_dir="logs", debug=False): + if not os.path.exists(log_dir): + os.makedirs(log_dir) + log_file_path = os.path.join(log_dir, LOG_FILE) + + # Remove default loguru handler + logger.remove() + + # Add console handler with a specific log level + level = "DEBUG" if debug else "INFO" + logger.add(sys.stdout, level=level) + # Add file handler with a specific log level and timed rotation + logger.add(log_file_path, rotation=ROTATION_TIME, level="DEBUG") + self.logger = logger + +LOG = Logger(debug=True).logger + +if __name__ == "__main__": + log = Logger().logger + + log.debug("This is a debug message.") + log.info("This is an info message.") + log.warning("This is a warning message.") + log.error("This is an error message.") diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/config.yaml b/langchain/jupyter/homework/panxizhi/openai-translator/config.yaml new file mode 100644 index 00000000..3ff6f1ec --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/config.yaml @@ -0,0 +1,5 @@ +model_name: "gpt-3.5-turbo" +input_file: "tests/test.pdf" +output_file_format: "markdown" +source_language: "English" +target_language: "Chinese" \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/fonts/simsun.ttc b/langchain/jupyter/homework/panxizhi/openai-translator/fonts/simsun.ttc new file mode 100644 index 00000000..e64e92ed Binary files /dev/null and b/langchain/jupyter/homework/panxizhi/openai-translator/fonts/simsun.ttc 
differ diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/images/sample_image_0.png b/langchain/jupyter/homework/panxizhi/openai-translator/images/sample_image_0.png new file mode 100644 index 00000000..9773db46 Binary files /dev/null and b/langchain/jupyter/homework/panxizhi/openai-translator/images/sample_image_0.png differ diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/images/sample_image_1.png b/langchain/jupyter/homework/panxizhi/openai-translator/images/sample_image_1.png new file mode 100644 index 00000000..6af881ac Binary files /dev/null and b/langchain/jupyter/homework/panxizhi/openai-translator/images/sample_image_1.png differ diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/jupyter/flask_client.ipynb b/langchain/jupyter/homework/panxizhi/openai-translator/jupyter/flask_client.ipynb new file mode 100644 index 00000000..b9049398 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/jupyter/flask_client.ipynb @@ -0,0 +1,117 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "a4e32dd8-6716-463e-8120-91682dd65830", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: requests in /root/miniconda3/envs/langchain/lib/python3.10/site-packages (2.31.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /root/miniconda3/envs/langchain/lib/python3.10/site-packages (from requests) (3.1.0)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /root/miniconda3/envs/langchain/lib/python3.10/site-packages (from requests) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /root/miniconda3/envs/langchain/lib/python3.10/site-packages (from requests) (2023.5.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in /root/miniconda3/envs/langchain/lib/python3.10/site-packages (from requests) (3.4)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], + "source": [ + "!pip install requests" + ] + }, + { + "cell_type": "markdown", + "id": "eab5fa3d-3e27-4337-a891-85eda96ddf50", + "metadata": {}, + "source": [ + "# 使用 requests 库请求翻译服务 API" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "596c6c45-8b95-4beb-ae6f-b1c49aa4020c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Translation completed. 
Translated file saved as translated_output.md.\n" + ] + } + ], + "source": [ + "import requests\n", + "\n", + "# Flask服务器的地址\n", + "FLASK_SERVER_URL = 'http://localhost:5000'\n", + "\n", + "# 翻译服务接口\n", + "translation_url = f'{FLASK_SERVER_URL}/translation'\n", + "\n", + "# 要上传的文件路径\n", + "file_path = '../tests/test.pdf' # 修改为你的文件路径\n", + "\n", + "# 构建请求参数\n", + "params = {\n", + " 'source_language': 'English', # 修改为你的源语言\n", + " 'target_language': 'Chinese' # 修改为你的目标语言\n", + "}\n", + "\n", + "# 发送POST请求\n", + "with open(file_path, 'rb') as file:\n", + " files = {'input_file': file}\n", + " response = requests.post(translation_url, files=files, data=params)\n", + "\n", + "\n", + "# 翻译后文件\n", + "output_filename = \"translated_output.md\"\n", + "\n", + "# 处理响应\n", + "if response.status_code == 200:\n", + " # 保存翻译后的文件\n", + " with open(output_filename, 'wb') as output_file:\n", + " output_file.write(response.content)\n", + " print(f\"Translation completed. Translated file saved as {output_filename}.\")\n", + "else:\n", + " print(f\"Translation failed. Status code: {response.status_code}\")\n", + " print(response.text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2da6286a-d92c-4eaf-abec-9f74cb7af599", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/jupyter/translation_chain.ipynb b/langchain/jupyter/homework/panxizhi/openai-translator/jupyter/translation_chain.ipynb new file mode 100644 index 00000000..c405ab58 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/jupyter/translation_chain.ipynb @@ -0,0 +1,611 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "69cb775b-eb44-4e95-83c2-c087947d7ae4", + "metadata": {}, + "source": [ + "# LangChain 核心模块 - Chat Model 和 Chat Prompt Template\n", + "\n", + "希望通过此示例,让大家深入理解 LangChain 的聊天模型。简而言之:\n", + "- `Chat Model` 不止是一个用于聊天对话的模型抽象,更重要的是提供了`多角色`提示能力(System,AI,Human,Function)。\n", + "- `Chat Prompt Template` 则为开发者提供了便捷维护`不同角色`的`提示模板`与`消息记录`的接口。\n", + "\n", + "![](../../jupyter/images/model_io.jpeg)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2219b200-bb76-45b7-bb88-0525667c1b31", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8a376a3d-ce1f-4aab-9a2a-6224fce2edfd", + "metadata": {}, + "source": [ + "## 温故:LangChain Chat Model 使用方法和流程\n", + "\n", + "在最终调用 `Chat Model` 时,一定是直接传入`LangChain Schema Messages(消息记录)` \n", + "\n", + "```python\n", + "from langchain.chat_models import ChatOpenAI\n", + "chat_model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n", + "\n", + "from langchain.schema import (\n", + " AIMessage,\n", + " HumanMessage,\n", + " SystemMessage\n", + ")\n", + "\n", + "messages = [SystemMessage(content=\"You are a helpful assistant.\"),\n", + " HumanMessage(content=\"Who won the world series in 2020?\"),\n", + " AIMessage(content=\"The Los Angeles Dodgers won the World Series in 2020.\"), \n", + " HumanMessage(content=\"Where was it played?\")]\n", + "\n", + 
"print(messages)\n", + "\n", + "chat_model(messages)\n", + "```\n", + "\n", + "打印 messages 输出结果:\n", + "```\n", + "[\n", + " SystemMessage(content=\"You are a helpful assistant.\", additional_kwargs={}),\n", + " HumanMessage(\n", + " content=\"Who won the world series in 2020?\", additional_kwargs={}, example=False\n", + " ),\n", + " AIMessage(\n", + " content=\"The Los Angeles Dodgers won the World Series in 2020.\",\n", + " additional_kwargs={},\n", + " example=False,\n", + " ),\n", + " HumanMessage(content=\"Where was it played?\", additional_kwargs={}, example=False),\n", + "]\n", + "```\n", + "\n", + "调用 chat_model(messages) 返回结果:\n", + "\n", + "```\n", + "AIMessage(\n", + " content=\"The 2020 World Series was played at Globe Life Field in Arlington, Texas.\",\n", + " additional_kwargs={},\n", + " example=False,\n", + ")\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "44337cac-457c-4971-b382-498dec3e3c24", + "metadata": {}, + "source": [ + "## 使用 System 和 Human 角色的提示模板构造 ChatPromptTemplate\n", + "\n", + "使用 `ChatPromptTemplate.from_messages` 方法,类似使用和维护`messages`的方式,构造 `chat_prompt_template`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0e4d76a2-1ccb-421a-9be2-1a4a6faa3b8b", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema import AIMessage, HumanMessage, SystemMessage\n", + "# 导入 Chat Model 即将使用的 Prompt Templates\n", + "from langchain.prompts.chat import (\n", + " ChatPromptTemplate,\n", + " SystemMessagePromptTemplate,\n", + " AIMessagePromptTemplate,\n", + " HumanMessagePromptTemplate,\n", + ")\n", + "\n", + "# 翻译任务指令始终由 System 角色承担\n", + "template = (\n", + " \"\"\"You are a translation expert, proficient in various languages. \\n\n", + " Translates English to Chinese.\"\"\"\n", + ")\n", + "system_message_prompt = SystemMessagePromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "596d74c4-0fa5-4308-9920-c8002a3ee0c7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prompt=PromptTemplate(input_variables=[], output_parser=None, partial_variables={}, template='You are a translation expert, proficient in various languages. 
\\n\\n Translates English to Chinese.', template_format='f-string', validate_template=True) additional_kwargs={}\n" + ] + } + ], + "source": [ + "print(system_message_prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d2a8fddd-31d7-493f-94a1-4c1ec4608de6", + "metadata": {}, + "outputs": [], + "source": [ + "# 待翻译文本由 Human 角色输入\n", + "human_template = \"{text}\"\n", + "human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "8922d062-00a9-48dd-b1e2-ed2d2f61a9ad", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prompt=PromptTemplate(input_variables=['text'], output_parser=None, partial_variables={}, template='{text}', template_format='f-string', validate_template=True) additional_kwargs={}\n" + ] + } + ], + "source": [ + "print(human_message_prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "125cef74-88a0-4775-a513-06692592d801", + "metadata": {}, + "outputs": [], + "source": [ + "# 使用 System 和 Human 角色的提示模板构造 ChatPromptTemplate\n", + "chat_prompt_template = ChatPromptTemplate.from_messages(\n", + " [system_message_prompt, human_message_prompt]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d16f01ad-b7c9-4a0a-af8f-698f55d4762e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input_variables=['text'] output_parser=None partial_variables={} messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], output_parser=None, partial_variables={}, template='You are a translation expert, proficient in various languages. \\n\\n Translates English to Chinese.', template_format='f-string', validate_template=True), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['text'], output_parser=None, partial_variables={}, template='{text}', template_format='f-string', validate_template=True), additional_kwargs={})]\n" + ] + } + ], + "source": [ + "print(chat_prompt_template)" + ] + }, + { + "cell_type": "markdown", + "id": "406a2804-96f5-4470-9464-a96572e31149", + "metadata": {}, + "source": [ + "### 规范化 Python 复杂对象\n", + "\n", + "- 使用在线工具 [Python Formatter](https://codebeautify.org/python-formatter-beautifier) \n", + "- 规范化 `chat_prompt_template`后再查看\n", + "- 注意:不要同事输入多个复杂对象\n", + "\n", + "```json\n", + "messages = [\n", + " SystemMessagePromptTemplate(\n", + " prompt=PromptTemplate(\n", + " input_variables=[],\n", + " output_parser=None,\n", + " partial_variables={},\n", + " template=\"You are a translation expert, proficient in various languages. \\n\\n Translates English to Chinese.\",\n", + " template_format=\"f-string\",\n", + " validate_template=True,\n", + " ),\n", + " additional_kwargs={},\n", + " ),\n", + " HumanMessagePromptTemplate(\n", + " prompt=PromptTemplate(\n", + " input_variables=[\"text\"],\n", + " output_parser=None,\n", + " partial_variables={},\n", + " template=\"{text}\",\n", + " template_format=\"f-string\",\n", + " validate_template=True,\n", + " ),\n", + " additional_kwargs={},\n", + " ),\n", + "]\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b5cf9238-9497-46c3-961f-860e797fcece", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ChatPromptValue(messages=[SystemMessage(content='You are a translation expert, proficient in various languages. 
\\n\\n Translates English to Chinese.', additional_kwargs={}), HumanMessage(content='I love programming.', additional_kwargs={}, example=False)])" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 生成用于翻译的 Chat Prompt\n", + "chat_prompt_template.format_prompt(text=\"I love programming.\")" + ] + }, + { + "cell_type": "markdown", + "id": "d3a39ff5-2720-40db-80f4-913c8c622bfa", + "metadata": {}, + "source": [ + "## 使用 chat_prompt_template.to_messages 方法生成 Messages" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a21508c5-4b13-4d0e-aa67-c56d554d2626", + "metadata": {}, + "outputs": [], + "source": [ + "# 生成聊天模型真正可用的消息记录 Messages\n", + "chat_prompt = chat_prompt_template.format_prompt(text=\"I love programming.\").to_messages()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a158a289-e738-45e8-b440-3f857a708e57", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[SystemMessage(content='You are a translation expert, proficient in various languages. \\n\\n Translates English to Chinese.', additional_kwargs={}),\n", + " HumanMessage(content='I love programming.', additional_kwargs={}, example=False)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chat_prompt" + ] + }, + { + "cell_type": "markdown", + "id": "56e2fa43-ba3c-401f-bba0-6bb27050a363", + "metadata": {}, + "source": [ + "## 使用 Chat Model(GPT-3.5-turbo)实际执行翻译任务\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5d4e3bd9-fac3-4b67-9285-74e1793f9e74", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "# 为了翻译结果的稳定性,将 temperature 设置为 0\n", + "translation_model = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c46d710c-9892-4ce2-8dcf-c3450dae8d83", + "metadata": {}, + "outputs": [], + "source": [ + "translation_result = translation_model(chat_prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "63d7cadf-515a-41cd-82d7-1498a3d6a35f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='我喜欢编程。', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translation_result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d56c7a65-bc6c-4e19-8176-ddc457696d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我喜欢编程。\n" + ] + } + ], + "source": [ + "# 查看翻译结果\n", + "print(translation_result.content)" + ] + }, + { + "cell_type": "markdown", + "id": "75f3e72a-90e0-40dc-afa8-0ff96c2b7d72", + "metadata": {}, + "source": [ + "## 使用 LLMChain 简化重复构造 ChatPrompt" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4a57ee8c-dad1-429c-a791-2797a7b0c4ef", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import LLMChain\n", + "\n", + "# 无需再每次都使用 to_messages 方法构造 Chat Prompt\n", + "translation_chain = LLMChain(llm=translation_model, prompt=chat_prompt_template)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "723cebfa-a3bd-434f-9992-9ec25833acdf", + "metadata": {}, + "outputs": [], + "source": [ + "# 等价于 translation_result.content (字符串类型)\n", + "chain_result = translation_chain.run({'text': \"I love programming.\"})" + ] + }, + { + 
"cell_type": "code", + "execution_count": 16, + "id": "21e1d186-88de-4ca8-ad32-c233d99145c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "我喜欢编程。\n" + ] + } + ], + "source": [ + "print(chain_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "faddd232-0308-45ce-b2a6-899f261abe57", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'我喜欢人工智能和大型语言模型。'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translation_chain.run({'text': \"I love AI and Large Language Model.\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "4f3dcb51-7e3f-4787-84aa-60f45def661e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[水果, 颜色, 价格(美元)] [苹果, 红色, 1.20] [香蕉, 黄色, 0.50] [橙子, 橙色, 0.80] [草莓, 红色, 2.50] [蓝莓, 蓝色, 3.00] [猕猴桃, 绿色, 1.00] [芒果, 橙色, 1.50] [葡萄, 紫色, 2.00]'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translation_chain.run({'text': \"[Fruit, Color, Price (USD)] [Apple, Red, 1.20] [Banana, Yellow, 0.50] [Orange, Orange, 0.80] [Strawberry, Red, 2.50] [Blueberry, Blue, 3.00] [Kiwi, Green, 1.00] [Mango, Orange, 1.50] [Grape, Purple, 2.00]\"})\n" + ] + }, + { + "cell_type": "markdown", + "id": "a47a1bfd-2beb-41eb-a1c1-c6443e4ec129", + "metadata": {}, + "source": [ + "## 扩展:支持多语言对翻译" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bb9b37f6-4bcb-450c-b3c6-e476a8f15bc7", + "metadata": {}, + "outputs": [], + "source": [ + "# System 增加 source_language 和 target_language\n", + "template = (\n", + " \"\"\"You are a translation expert, proficient in various languages. \\n\n", + " Translates {source_language} to {target_language}.\"\"\"\n", + ")\n", + "system_message_prompt = SystemMessagePromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2fccc32f-2b3f-4666-82be-607c452893b7", + "metadata": {}, + "outputs": [], + "source": [ + "# 待翻译文本由 Human 角色输入\n", + "human_template = \"{text}\"\n", + "human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "fd937e81-c54f-49d9-9422-c43670d8e5db", + "metadata": {}, + "outputs": [], + "source": [ + "# 使用 System 和 Human 角色的提示模板构造 ChatPromptTemplate\n", + "m_chat_prompt_template = ChatPromptTemplate.from_messages(\n", + " [system_message_prompt, human_message_prompt]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "ab237375-59ab-40ac-986b-7254989b3418", + "metadata": {}, + "outputs": [], + "source": [ + "m_translation_chain = LLMChain(llm=translation_model, prompt=m_chat_prompt_template) " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e728621c-008e-4c2b-bb57-483f1ee4c1e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'I enjoy studying large language models, as it is easy, simple, and enjoyable.'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m_translation_chain.run({\n", + " \"source_language\": \"Chinese\",\n", + " \"target_language\": \"English\",\n", + " \"text\": \"我喜欢学习大语言模型,轻松简单又愉快\",\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "183455b3-9cf1-42b4-be91-9c699248266a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"'私は大規模言語モデルの学習が好きです。それは簡単で楽しいです。'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "m_translation_chain.run({\n", + " \"source_language\": \"Chinese\",\n", + " \"target_language\": \"Japanese\",\n", + " \"text\": \"我喜欢学习大语言模型,轻松简单又愉快\",\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "d42bf232-d2ef-47d8-9029-97940d7f784a", + "metadata": {}, + "source": [ + "## Homework\n", + "- 尝试不同的 System Prompt 和 Chat Model,对比翻译效果。\n", + "- 根据翻译任务的使用场景,是否可以在初次传入 source_language 和 target_language 后不再更新?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38ee618a-bbaa-476f-80e6-8d22a81fddf6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/jupyter/homework/panxizhi/openai-translator/requirements.txt b/langchain/jupyter/homework/panxizhi/openai-translator/requirements.txt new file mode 100644 index 00000000..a331ebea --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/openai-translator/requirements.txt @@ -0,0 +1,12 @@ +pdfplumber +simplejson +requests +PyYAML +pillow +reportlab +pandas +loguru +openai +langchain +gradio +flask \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/sales_chatbot/create_sales_vector_dbs.py b/langchain/jupyter/homework/panxizhi/sales_chatbot/create_sales_vector_dbs.py new file mode 100644 index 00000000..f3d38423 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/sales_chatbot/create_sales_vector_dbs.py @@ -0,0 +1,26 @@ +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS + +from langchain.text_splitter import CharacterTextSplitter + +def load_data_to_vectordb(file, db_name): + with open(file, encoding='utf8') as f: + text = f.read() + + text_splitter = CharacterTextSplitter( + separator = r'$\d+\.\n', + chunk_size = 0, + chunk_overlap = 0, + length_function = len, + is_separator_regex = True, + ) + docs = text_splitter.create_documents([text]) + + db = FAISS.from_documents(docs, OpenAIEmbeddings()) + + db.save_local(db_name) + +if __name__ == "__main__": + load_data_to_vectordb("data/sales_eletronic_stuffs.txt", "sales_eletronic_stuffs") + load_data_to_vectordb("data/real_estate_sales_data.txt", "real_estate_sales_data") \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/sales_chatbot/data/real_estate_sales_data.txt b/langchain/jupyter/homework/panxizhi/sales_chatbot/data/real_estate_sales_data.txt new file mode 100644 index 00000000..453fb6a8 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/sales_chatbot/data/real_estate_sales_data.txt @@ -0,0 +1,279 @@ +1. +[客户问题] 这个小区交通便利吗? +[销售回答] 当然了,这个小区距离地铁站只有几分钟的步行距离,而且附近有多条公交线路,非常方便。 + +2. +[客户问题] 我担心楼下太吵。 +[销售回答] 这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。 + +3. +[客户问题] 我看房价还在涨,这个投资回报怎么样? +[销售回答] 这个区域未来有大量的商业和基础设施建设,所以从长期来看,投资回报非常有保证。 + +4. +[客户问题] 有没有学校? +[销售回答] 附近有多所优质的学校,非常适合有孩子的家庭。 + +5. +[客户问题] 物业管理怎么样? +[销售回答] 我们的物业管理得到了业主一致好评,服务非常到位。 + +6. +[客户问题] 我想要南向的房子。 +[销售回答] 很好,我们确实有一些朝南的单位,它们的采光特别好。 + +7. 
+[客户问题] 这个小区安全吗? +[销售回答] 当然,我们24小时安保巡逻,还有先进的监控系统。 + +8. +[客户问题] 预计什么时候交房? +[销售回答] 根据目前的进度,我们预计将在明年底交房。 + +9. +[客户问题] 我不想要一楼的房子。 +[销售回答] 我理解您的顾虑,我们还有多个楼层的房源可以选择。 + +10. +[客户问题] 有优惠吗? +[销售回答] 当然,如果您现在下订,我们可以给您一些优惠。 + +11. +[客户问题] 你们是否提供按揭服务? +[销售回答] 是的,我们与多家银行合作,可以帮助您快速办理按揭。 + +12. +[客户问题] 税费怎么算? +[销售回答] 我们可以提供详细的税费咨询服务,确保您清楚所有费用。 + +13. +[客户问题] 附近有医院吗? +[销售回答] 是的,附近有多家大型医院,医疗资源非常丰富。 + +14. +[客户问题] 我担心小区会很拥挤。 +[销售回答] 这个小区总体规划非常合理,保证了每个单元之间有足够的空间。 + +15. +[客户问题] 这个小区有游泳池和健身房吗? +[销售回答] 当然,我们提供全方位的生活设施,包括游泳池和健身房。 + +16. +[客户问题] 我需要两个停车位,怎么办? +[销售回答] 我们的小区有充足的停车位,并且可以额外购买,确保您的停车需求得到满足。 + +17. +[客户问题] 小区里有公园吗? +[销售回答] 是的,我们有专门设计的绿化区和儿童游乐园,确保您和家人能享受到高质量的生活。 + +18. +[客户问题] 我看到附近有个建筑工地。 +[销售回答] 那个工地预计很快就会完工,而且我们的窗户有很好的隔音效果。 + +19. +[客户问题] 房子是精装还是毛坯? +[销售回答] 我们提供多种选择,包括精装和毛坯,以满足不同客户的需求。 + +20. +[客户问题] 这里有商业区吗? +[销售回答] 是的,小区附近有大型购物中心和各种餐厅,非常方便。 + +21. +[客户问题] 我需要大阳台。 +[销售回答] 我们有一些户型设计带有大阳台,您可以在那里种花、晒衣或者放一个小茶几,非常宜人。 + +22. +[客户问题] 我担心电梯不够用。 +[销售回答] 每栋楼都配有多部高速电梯,基本上不用等待。 + +23. +[客户问题] 垃圾处理怎么样? +[销售回答] 我们有专门的垃圾分类和回收系统,非常环保。 + +24. +[客户问题] 我担心交房延期。 +[销售回答] 我们是知名的房地产开发商,一直严格按照计划执行,您完全可以放心。 + +25. +[客户问题] 我看这里有很多空置房。 +[销售回答] 这些房源都是新推出的,我们预计在近期内会有更多的家庭搬入。 + +26. +[客户问题] 楼下有便利店吗? +[销售回答] 当然,小区内就有便利店,满足您日常生活的基本需求。 + +27. +[客户问题] 我需要分期付款。 +[销售回答] 我们有多种付款方案,包括分期付款,以减轻您的经济压力。 + +28. +[客户问题] 我家有宠物,可以养吗? +[销售回答] 当然,我们是一个宠物友好的社区,但需要遵守一些基本规则。 + +29. +[客户问题] 我想看下水电气等设施。 +[销售回答] 没问题,我们使用的都是品牌供应商,质量非常可靠。 + +30. +[客户问题] 你们会提供家具吗? +[销售回答] 我们的精装房会提供基础家具和家电,让您拎包入住。 + +31. +[客户问题] 都有哪些户型? +[销售回答] 我们有从一室到四室不等的多种户型,定能满足您不同的居住需求。 + +32. +[客户问题] 我听说这个区域交通常常堵塞。 +[销售回答] 我们小区的位置实际上是绕开了主要交通瓶颈区,所以相对来说会流畅很多。 + +33. +[客户问题] 我很关注环境保护,你们有做到吗? +[销售回答] 非常好的问题,我们的建筑采用环保材料,并且在小区规划中也有绿化区域和垃圾分类系统。 + +34. +[客户问题] 楼下会有商铺吗? +[销售回答] 是的,楼下将设有商铺,但都是服务性质的,不会影响您的居住体验。 + +35. +[客户问题] 社区活动丰富吗? +[销售回答] 我们有专门的社区活动中心,常常组织各种活动,非常丰富。 + +36. +[客户问题] 你们有提供家政服务吗? +[销售回答] 我们与多家家政服务公司有合作,可以方便您的日常生活。 + +37. +[客户问题] 我担心物业费会很高。 +[销售回答] 我们的物业费是根据市场和服务水平来设定的,绝对物有所值。 + +38. +[客户问题] 附近有地铁站吗? +[销售回答] 附近就有地铁站,而且有多条公交线路经过,出行非常方便。 + +39. +[客户问题] 这里网络信号怎么样? +[销售回答] 我们有完善的网络基础设施,信号覆盖非常好。 + +40. +[客户问题] 我想先租赁,观察一下再买。 +[销售回答] 这也是一个不错的选择,我们也提供租赁服务,您可以先体验一下。 + +41. +[客户问题] 有没有燃气管道? +[销售回答] 当然有,我们的所有单位都连接了燃气管道,非常安全和便利。 + +42. +[客户问题] 我是投资购房,租金回报如何? +[销售回答] 这个区域的租赁市场非常火爆,您投资购房后,租金回报率相当不错。 + +43. +[客户问题] 我喜欢安静,这里噪音大吗? +[销售回答] 我们特意进行了隔音设计,并且小区内部也有绿化带,整体非常安静。 + +44. +[客户问题] 我看这个小区还在建设中。 +[销售回答] 是的,目前正在建设中,但已经接近尾声,很快就会完工。 + +45. +[客户问题] 有哪些休闲设施? +[销售回答] 我们有游泳池、健身房、图书馆以及儿童游乐场等多种休闲设施。 + +46. +[客户问题] 我担心暖气不足。 +[销售回答] 我们使用的是中央供暖系统,温度可以保证。 + +47. +[客户问题] 需要交多少定金? +[销售回答] 定金金额取决于您选择的房型和付款方式,我们可以详细为您解释。 + +48. +[客户问题] 哪些银行可以贷款? +[销售回答] 我们与多家知名银行合作,贷款流程非常方便。 + +49. +[客户问题] 我可以改变房间的布局吗? +[销售回答] 对于毛坯房,您有更多的自由度来改变房间布局。 + +50. +[客户问题] 你们怎么解决停车问题? +[销售回答] 我们有地下车库和多个停车位,确保每个家庭都有地方停车。 + +51. +[客户问题] 这里安全吗? +[销售回答] 非常安全。我们有24小时的安保人员和先进的监控系统。 + +52. +[客户问题] 我需要比较大的储藏空间。 +[销售回答] 我们的设计包括了充足的储藏空间,以满足您的需求。 + +53. +[客户问题] 这里的电梯需要等很久吗? +[销售回答] 不会的,我们的电梯系统高效可靠,响应速度很快。 + +54. +[客户问题] 你们接受定制吗? +[销售回答] 当然,对于特定的户型,我们提供一定程度的定制服务。 + +55. +[客户问题] 我看周围没有学校。 +[销售回答] 其实附近就有几所知名的学校,并且我们也在考虑未来在社区内建立教育设施。 + +56. +[客户问题] 我有老人需要照顾,有便利吗? +[销售回答] 我们小区有无障碍通道和专门的老年活动中心,非常适合老年人。 + +57. +[客户问题] 这里会不会很吵? +[销售回答] 我们有良好的隔音设计和规划,内部环境非常宁静。 + +58. +[客户问题] 这个价位对我来说有点高。 +[销售回答] 我们有不同户型和付款方案,一定有适合您预算的。 + +59. +[客户问题] 物业管理怎么样? +[销售回答] 我们的物业管理得到业主一致好评,服务周到且高效。 + +60. +[客户问题] 如果我想转卖这个房子呢? +[销售回答] 我们的房子位于黄金地段,升值潜力很大,转卖起来不会有问题。 + +61. +[客户问题] 这个小区规模太大了。 +[销售回答] 实际上,大规模的社区能提供更多的设施和更完善的服务。 + +62. +[客户问题] 交通如何? +[销售回答] 交通非常便利,不仅靠近地铁站,还有多条公交线路。 + +63. +[客户问题] 附近有医院吗? +[销售回答] 有的,距离我们小区不远就有几家大型综合医院。 + +64. 
+[客户问题] 我担心窗户太小,采光不好。 +[销售回答] 我们所有的户型都经过精心设计,以确保最佳的采光和通风。 + +65. +[客户问题] 这个房子看起来太小了。 +[销售回答] 虽然面积可能不大,但我们的设计使得每一寸空间都得到了充分利用。 + +66. +[客户问题] 我看附近都是老房子。 +[销售回答] 这个区域正在快速发展,未来的升值潜力非常大。 + +67. +[客户问题] 有没有健身房? +[销售回答] 当然,我们的小区内有设备齐全的健身房。 + +68. +[客户问题] 我需要考虑一下。 +[销售回答] 当然,这是一个重要的决定。但我建议您不要等太久,毕竟好房不等人。 + +69. +[客户问题] 我担心楼下的商业活动会很吵。 +[销售回答] 我们在规划时就已经考虑到这一点,商业区和居住区有一定的距离和隔音设计。 + +70. +[客户问题] 我担心新房会有甲醛。 +[销售回答] 我们使用环保材料建造,并且所有的新房在交付前都会进行甲醛检测。 \ No newline at end of file diff --git a/langchain/jupyter/homework/panxizhi/sales_chatbot/data/sales_eletronic_stuffs.txt b/langchain/jupyter/homework/panxizhi/sales_chatbot/data/sales_eletronic_stuffs.txt new file mode 100644 index 00000000..3bdf8190 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/sales_chatbot/data/sales_eletronic_stuffs.txt @@ -0,0 +1,185 @@ + +1. +顾客:你好,我想购买一台新的冰箱。你能推荐一款吗? +销售员:当然可以!您需要多大的容量?还有其他特殊需求吗? +顾客:我需要一个容量适中的冰箱,最好带有冷冻功能。 +销售员:我推荐我们新款的ABC型冰箱,容量适中,配备了先进的冷冻技术,可以满足您的需求。 + +2. +顾客:我需要购买一台新的洗衣机,但我对不同型号和功能不太了解。 +销售员:没问题,我可以帮您解决疑惑。您对洗衣机的容量有任何要求吗? +顾客:我需要一台大容量的洗衣机,最好具备快速洗涤和节能功能。 +销售员:我们的XYZ型洗衣机是一款高容量的洗衣机,具备快速洗涤选项,并且符合节能标准。 + +3. +顾客:我想买一台新的电视,但我对最新的技术不太熟悉。有什么推荐吗? +销售员:当然!您对屏幕尺寸有任何要求吗?还有您对高清图像或智能功能有兴趣吗? +顾客:我需要一台大屏幕的电视,最好支持高清图像和智能功能。 +销售员:我们的EFG型电视是一款大屏幕电视,具备高清图像和智能功能,您会对其表现非常满意。 + +4. +顾客:我正在寻找一台新的空调,怎样才能选择合适的型号? +销售员:选择空调时,您需要考虑房间的大小和您对温度控制的需求。您对能效有任何要求吗? +顾客:我需要一台适合中等大小房间的空调,最好具备节能功能。 +销售员:我们的HIJ型空调是一款适合中等大小房间的空调,具备优异的节能性能,可以帮助您降低能源消耗。 + +5. +顾客:我想购买一台新的咖啡机,但我对不同类型和品牌不太了解。 +销售员:没问题,咖啡机有很多选择,我可以帮您找到适合您的。您更喜欢滴漏咖啡机还是胶囊咖啡机? +顾客:我更喜欢胶囊咖啡机,方便简单。 +销售员:我们的KLM型胶囊咖啡机是一款简便易用的咖啡机,配备了多种口味的咖啡胶囊,适合您的需求。 + +6. +顾客:我需要购买一个新的微波炉,我应该选择哪种类型? +销售员:微波炉有各种类型,您更注重烹饪功能还是外观设计? +顾客:我更关注烹饪功能,我希望能有多种预设菜单和解冻功能。 +销售员:我们的NOP型微波炉具备多种预设菜单和解冻功能,可以满足您的需求。 + +7. +顾客:我想购买一个新的音响系统,我应该选择有线还是无线? +销售员:选择音响系统时,您需要考虑您的音源设备和使用环境。您更喜欢方便携带的无线音响还是更稳定的有线连接? +顾客:我更喜欢无线音响,因为我可以在不同房间中移动。 +销售员:我们的QRS型无线音响系统具备稳定的无线连接和高质量音效,适合您的需求。 + +8. +顾客:我需要购买一台新的吸尘器,我应该选择立式还是手持式? +销售员:您打算用吸尘器清理哪些区域?如果需要清理大面积地板,立式吸尘器可能更方便。手持吸尘器适合清理狭小区域和车辆内部。 +顾客:我主要需要清理地板,我会选择立式吸尘器。 +销售员:我们的TUV型立式吸尘器具备强大的吸力和易于操作的设计,适合清理地板和地毯。 + +9. +顾客:我想购买一个新的电饭煲,有什么推荐吗? +销售员:电饭煲有各种型号和功能,您对容量有任何要求吗?还有您是否需要多种烹饪模式? +顾客:我需要一个中等容量的电饭煲,并且希望具备煮粥和蒸煮功能。 +销售员:我们的WXY型电饭煲容量适中,具备多种烹饪模式,包括煮粥和蒸煮功能,符合您的需求。 + +10. +顾客:我正在寻找一台新的笔记本电脑,有什么建议吗? +销售员:笔记本电脑有不同的品牌和配置,您对使用场景和性能有任何要求吗?例如,处理器速度、内存容量和存储空间等。 +顾客:我需要一台适合办公和娱乐的笔记本电脑,性能稳定且存储空间充足。 +销售员:我们的YZX型笔记本电脑是一款适合办公和娱乐的选择,具备稳定的性能和大容量存储空间,可以满足您的需求。 + + +11. +顾客:我需要购买一台新的烤箱,有什么特别的功能可以推荐吗? +销售员:烤箱有各种特色功能,您对烘烤、烤制还是烧烤有更多兴趣? +顾客:我对烘烤和烤制食物很感兴趣。 +销售员:我们的ABC型烤箱具备多种预设烘烤程序和温度控制,可以满足您的烘烤和烤制需求。 + +12. +顾客:我正在考虑购买一个新的电动牙刷,有什么品牌和型号可以推荐吗? +销售员:电动牙刷有多个品牌和型号,您对清洁效果、震动频率还是附加功能有任何要求? +顾客:我希望能有一个高效的清洁效果,还可以有按摩牙龈的功能。 +销售员:我们的DEF型电动牙刷具备高效的清洁效果和牙龈按摩功能,可以满足您的需求。 + +13. +顾客:我想购买一个新的电动剃须刀,什么型号适合我? +销售员:电动剃须刀有旋转式和网状式两种类型,您对剃须方式有任何偏好吗? +顾客:我更喜欢旋转式剃须刀。 +销售员:我们的GHI型旋转式电动剃须刀具备高速旋转刀头和舒适的剃须体验,适合您的需求。 + +14. +顾客:我需要购买一个新的搅拌机,有什么功能可以推荐吗? +销售员:搅拌机有多种功能,您对打碎冰块、制作奶昔还是搅拌食材有任何特殊需求? +顾客:我希望搅拌机可以打碎冰块和制作奶昔。 +销售员:我们的JKL型搅拌机具备强大的破冰功能和多速度调节,可以满足您的需求。 + +15. +顾客:我正在寻找一台新的投影仪,有什么特点可以推荐吗? +销售员:投影仪有不同的分辨率、亮度和连接选项,您对使用场景和画质有任何特殊要求吗? +顾客:我需要一个高分辨率和亮度适中的投影仪,可以用于家庭影院。 +销售员:我们的MNO型投影仪具备高清晰度和适中的亮度,适合家庭影院和日常使用。 + +16. +顾客:我想购买一台新的洗衣机,有什么类型可以推荐吗? +销售员:洗衣机有滚筒式和波轮式两种类型,您对洗涤方式和洗衣容量有任何特殊要求吗? +顾客:我需要一个大容量的洗衣机,并且希望能有多种洗涤程序。 +销售员:我们的PQR型滚筒式洗衣机具备大容量和多种洗涤程序,可以满足您的需求。 + +17. +顾客:我正在考虑购买一个新的冰箱,有什么品牌和型号可以推荐吗? +销售员:冰箱有不同的容量、冷藏方式和附加功能,您对冷藏空间和节能性能有任何特殊要求吗? +顾客:我需要一个中等容量的冰箱,并且希望具备节能功能。 +销售员:我们的STU型冰箱具备中等容量和节能性能,可以满足您的需求。 + +18. +顾客:我想购买一个新的空调,有什么特点可以推荐吗? +销售员:空调有不同的制冷能力、能效等级和附加功能,您对房间面积和能效要求有任何特殊要求吗? +顾客:我需要一个适合中等大小房间的空调,并且具备高能效。 +销售员:我们的VWX型空调具备适中的制冷能力和高能效等级,适合中等大小房间使用。 + +19. 
+顾客:我正在寻找一个新的电视,有什么特点可以推荐吗? +销售员:电视有不同的屏幕尺寸、分辨率和智能功能,您对画质和连接选项有任何特殊要求吗? +顾客:我需要一个大屏幕的电视,并且希望具备高清晰度和智能功能。 +销售员:我们的YZA型电视具备大屏幕、高分辨率和智能功能,可以满足您的需求。 + +20. +顾客:我想购买一个新的电动扫地机器人,有什么品牌和型号可以推荐吗? +销售员:电动扫地机器人有不同的品牌和功能,您对清扫效果和智能导航有任何特殊要求吗? +顾客:我希望电动扫地机器人能够高效清扫并具备智能导航功能。 +销售员:我们的BCD型电动扫地机器人具备高效清扫和智能导航功能,可以满足您的需求。 + + +21. +顾客:我需要购买一个新的咖啡机,有什么类型可以推荐吗? +销售员:咖啡机有滴滤式、胶囊式和半自动式等不同类型,您对咖啡制作的便捷性和口味要求有任何特殊要求吗? +顾客:我希望咖啡机操作简单,而且能制作出浓郁的咖啡。 +销售员:我们的EFG型咖啡机具备简单的操作界面和浓郁的咖啡口味,可以满足您的需求。 + +22. +顾客:我正在考虑购买一个新的微波炉,有什么品牌和型号可以推荐吗? +销售员:微波炉有不同的容量、加热方式和预设程序,您对加热功率和多功能设置有任何特殊要求吗? +顾客:我需要一个中等容量的微波炉,并且希望具备多种预设程序。 +销售员:我们的HIJ型微波炉具备中等容量和多种预设程序,可以满足您的需求。 + +23. +顾客:我想购买一个新的电饭煲,有什么特点可以推荐吗? +销售员:电饭煲有不同的容量、煮饭方式和保温功能,您对煮饭量和保温时间有任何特殊要求吗? +顾客:我需要一个大容量的电饭煲,并且希望能有长时间的保温功能。 +销售员:我们的KLM型电饭煲具备大容量和长时间保温功能,可以满足您的需求。 + +24. +顾客:我正在寻找一个新的吸尘器,有什么特点可以推荐吗? +销售员:吸尘器有直立式、手持式和无线式等不同类型,您对吸力强度和清洁功能有任何特殊要求吗? +顾客:我希望吸尘器具备强大的吸力和多功能清洁附件。 +销售员:我们的NOP型吸尘器具备强大的吸力和多种清洁附件,可以满足您的需求。 + +25. +顾客:我想购买一个新的电动搬运车,有什么品牌和型号可以推荐吗? +销售员:电动搬运车有不同的承载能力和行驶方式,您对搬运重量和操控方式有任何特殊要求吗? +顾客:我需要一个能够承载较重物品的电动搬运车,并且希望操控简便。 +销售员:我们的QRS型电动搬运车具备较高的承载能力和简便的操控方式,可以满足您的需求。 + + +26. +顾客:我想购买一个新的电动剃须刀,有什么品牌和型号可以推荐吗? +销售员:电动剃须刀有旋转式和往复式两种类型,您对刮胡方式和剃须舒适度有任何特殊要求吗? +顾客:我希望电动剃须刀能够提供干湿两种剃须方式,并且剃须时舒适不刺激皮肤。 +销售员:我们的TUV型电动剃须刀支持干湿两种剃须方式,并且具备舒适的剃须体验,可以满足您的需求。 + +27. +顾客:我正在考虑购买一个新的音响系统,有什么品牌和型号可以推荐吗? +销售员:音响系统有立体声、环绕声和多房间音响等不同类型,您对音质表现和连接选项有任何特殊要求吗? +顾客:我希望音响系统能够提供优质的音质表现,并且支持多种音频输入选项。 +销售员:我们的WXY型音响系统具备优质的音质表现和多种音频输入选项,可以满足您的需求。 + +28. +顾客:我想购买一个新的电动牙刷,有什么特点可以推荐吗? +销售员:电动牙刷有旋转式和声波式两种类型,您对刷牙效果和刷头种类有任何特殊要求吗? +顾客:我希望电动牙刷能够提供彻底的清洁效果,并且有多种刷头可以选择。 +销售员:我们的XYZ型电动牙刷具备彻底的清洁效果和多种刷头选项,可以满足您的需求。 + +29. +顾客:我正在寻找一个新的电动按摩椅,有什么特点可以推荐吗? +销售员:电动按摩椅有不同的按摩方式和强度调节,您对按摩部位和按摩程序有任何特殊要求吗? +顾客:我希望电动按摩椅能够提供全身的按摩功能,并且有多种按摩程序可选。 +销售员:我们的ABC型电动按摩椅具备全身按摩功能和多种按摩程序,可以满足您的需求。 + +30. +顾客:我想购买一个新的电动剪发器,有什么品牌和型号可以推荐吗? +销售员:电动剪发器有不同的剪发长度和附件选项,您对剪发效果和剪发方式有任何特殊要求吗? +顾客:我希望电动剪发器能够提供精准的剪发效果,并且具备多种剪发方式可选。 +销售员:我们的DEF型电动剪发器具备精准的剪发效果和多种剪发方式,可以满足您的需求。 + + diff --git a/langchain/jupyter/homework/panxizhi/sales_chatbot/sales.ipynb b/langchain/jupyter/homework/panxizhi/sales_chatbot/sales.ipynb new file mode 100644 index 00000000..c0da1e52 --- /dev/null +++ b/langchain/jupyter/homework/panxizhi/sales_chatbot/sales.ipynb @@ -0,0 +1,914 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "69032b4b-b031-4020-beb0-e8dfd22b660e", + "metadata": {}, + "source": [ + "# LangChain 实战:房产销售聊天机器人" + ] + }, + { + "cell_type": "markdown", + "id": "0f3e2d5a-c2ff-4faf-b153-d1751c25fae0", + "metadata": {}, + "source": [ + "## 使用 GPT-4 构造销售话术数据\n", + "\n", + "ChatGPT 分享链接:https://chat.openai.com/share/f3e4b9b0-95fb-4c6a-a3c7-f901dd194c91\n", + "\n", + "\n", + "使用 ChatGPT 构造销售数据的 Prompt 示例:\n", + "\n", + "```\n", + "你是中国顶级的房地产销售,现在培训职场新人,请给出100条实用的销售话术。\n", + "\n", + "每条销售话术以如下格式给出:\n", + "[客户问题]\n", + "[销售回答]\n", + "\n", + "```\n", + "\n", + "GPT-4 回复结果:\n", + "\n", + "```\n", + "在房地产销售行业中,精心准备的销售话术可以极大地提高成交率。以下是一些适用于中国房地产销售的实用话术:\n", + "\n", + "1. \n", + "[客户问题] 这个小区交通便利吗?\n", + "[销售回答] 当然了,这个小区距离地铁站只有几分钟的步行距离,而且附近有多条公交线路,非常方便。\n", + "\n", + "2. \n", + "[客户问题] 我担心楼下太吵。\n", + "[销售回答] 这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。\n", + "\n", + "3. \n", + "[客户问题] 我看房价还在涨,这个投资回报怎么样?\n", + "[销售回答] 这个区域未来有大量的商业和基础设施建设,所以从长期来看,投资回报非常有保证。\n", + "\n", + "4. \n", + "[客户问题] 有没有学校?\n", + "[销售回答] 附近有多所优质的学校,非常适合有孩子的家庭。\n", + "\n", + "5. \n", + "[客户问题] 物业管理怎么样?\n", + "[销售回答] 我们的物业管理得到了业主一致好评,服务非常到位。\n", + "\n", + "6. \n", + "[客户问题] 我想要南向的房子。\n", + "[销售回答] 很好,我们确实有一些朝南的单位,它们的采光特别好。\n", + "\n", + "7. 
\n", + "[客户问题] 这个小区安全吗?\n", + "[销售回答] 当然,我们24小时安保巡逻,还有先进的监控系统。\n", + "\n", + "8. \n", + "[客户问题] 预计什么时候交房?\n", + "[销售回答] 根据目前的进度,我们预计将在明年底交房。\n", + "\n", + "9. \n", + "[客户问题] 我不想要一楼的房子。\n", + "[销售回答] 我理解您的顾虑,我们还有多个楼层的房源可以选择。\n", + "\n", + "10. \n", + "[客户问题] 有优惠吗?\n", + "[销售回答] 当然,如果您现在下订,我们可以给您一些优惠。\n", + "```\n", + "\n", + "\n", + "## 使用 Document Transformers 模块来处理原始数据\n", + "\n", + "\n", + "将 ChatGPT 生成的结果保存到 [real_estate_sales_data.txt](real_estate_sales_data.txt) 文件中" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "efcee928-b9f4-4e4a-b9fb-f6cbee21b642", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"real_estate_sales_data.txt\") as f:\n", + " real_estate_sales = f.read()" + ] + }, + { + "cell_type": "markdown", + "id": "a9119556-d2e3-4edf-be16-15837da156f1", + "metadata": {}, + "source": [ + "### 使用 CharacterTextSplitter 来进行文本分割\n", + "\n", + "- 基于单字符来进行文本分割(separator)\n", + "- 基于字符数来决定文本块长度(chunk_size)\n", + "\n", + "参考示例:\n", + "\n", + "```python\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "text_splitter = CharacterTextSplitter( \n", + " separator = \"\\n\\n\",\n", + " chunk_size = 1000,\n", + " chunk_overlap = 200,\n", + " length_function = len,\n", + " is_separator_regex = False,\n", + ")\n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dbe535b8-2bd8-4c31-91d7-eba8aa61b3ef", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a4f30d9b-9fc6-4860-a497-0aa8220b9284", + "metadata": {}, + "outputs": [], + "source": [ + "text_splitter = CharacterTextSplitter( \n", + " separator = r'\\d+\\.',\n", + " chunk_size = 100,\n", + " chunk_overlap = 0,\n", + " length_function = len,\n", + " is_separator_regex = True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "530f3172-17a5-4924-ad17-284fbca422ae", + "metadata": {}, + "outputs": [], + "source": [ + "docs = text_splitter.create_documents([real_estate_sales])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3956ec36-706e-4f43-88fc-d704f2984290", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Document(page_content='[客户问题] 这个小区交通便利吗?\\n[销售回答] 当然了,这个小区距离地铁站只有几分钟的步行距离,而且附近有多条公交线路,非常方便。', metadata={})" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "73b2d47a-27ae-4cd4-a72a-3809b4c22bff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "70" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "e4617858-ad9c-42eb-a4c3-6bfb7108cdd2", + "metadata": {}, + "source": [ + "### 使用 Faiss 作为向量数据库,持久化存储房产销售 问答对(QA-Pair)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "508c2ab2-e187-424f-a43c-ca3d3b1a0550", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import FAISS\n", + "\n", + "db = FAISS.from_documents(docs, OpenAIEmbeddings())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "250e0873-95e2-4372-98a5-68c0feb304bb", + "metadata": {}, + "outputs": [], + "source": [ + "query = \"小区吵不吵\"" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0d18fcc6-c2be-434b-b8de-9648ac6f83f4", + "metadata": {}, + "outputs": [], + "source": [ + "answer_list = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e689667e-7a45-40d6-9eae-4e52cbc7daab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[客户问题] 我担心楼下太吵。\n", + "[销售回答] 这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。\n", + "\n", + "[客户问题] 我担心楼下的商业活动会很吵。\n", + "[销售回答] 我们在规划时就已经考虑到这一点,商业区和居住区有一定的距离和隔音设计。\n", + "\n", + "[客户问题] 我喜欢安静,这里噪音大吗?\n", + "[销售回答] 我们特意进行了隔音设计,并且小区内部也有绿化带,整体非常安静。\n", + "\n", + "[客户问题] 我担心小区会很拥挤。\n", + "[销售回答] 这个小区总体规划非常合理,保证了每个单元之间有足够的空间。\n", + "\n" + ] + } + ], + "source": [ + "for ans in answer_list:\n", + " print(ans.page_content + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a6cc47bf-1566-472f-84d2-a46bd634907c", + "metadata": {}, + "outputs": [], + "source": [ + "db.save_local(\"real_estates_sale\")" + ] + }, + { + "cell_type": "markdown", + "id": "0d5323e5-03fa-4c1c-b896-3ccc28af1857", + "metadata": {}, + "source": [ + "### 使用 retriever 从向量数据库中获取结果\n", + "\n", + "#### 使用参数 `k` 指定返回结果数量\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "17f051c9-8676-48d1-a9a4-e69b52630faf", + "metadata": {}, + "outputs": [], + "source": [ + "topK_retriever = db.as_retriever(search_kwargs={\"k\": 3})" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e33024db-ca7b-4d68-9b0e-c31c15ded57d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "VectorStoreRetriever(tags=['FAISS'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'k': 3})" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "topK_retriever" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7efb1414-859f-4818-bacd-e1e2edf34b95", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[客户问题] 我担心楼下太吵。\n", + "[销售回答] 这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。\n", + "\n", + "[客户问题] 我担心楼下的商业活动会很吵。\n", + "[销售回答] 我们在规划时就已经考虑到这一点,商业区和居住区有一定的距离和隔音设计。\n", + "\n", + "[客户问题] 我喜欢安静,这里噪音大吗?\n", + "[销售回答] 我们特意进行了隔音设计,并且小区内部也有绿化带,整体非常安静。\n", + "\n" + ] + } + ], + "source": [ + "docs = topK_retriever.get_relevant_documents(query)\n", + "for doc in docs:\n", + " print(doc.page_content + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "a3c4b62d-6f8f-461b-a4fb-41c8a404b831", + "metadata": {}, + "outputs": [], + "source": [ + "docs = topK_retriever.get_relevant_documents(\"你们有没有1000万的豪宅啊?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "04cca3d7-23d9-491f-8350-faeb19861dec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[客户问题] 你们会提供家具吗?\n", + "[销售回答] 我们的精装房会提供基础家具和家电,让您拎包入住。\n", + "\n", + "[客户问题] 都有哪些户型?\n", + "[销售回答] 我们有从一室到四室不等的多种户型,定能满足您不同的居住需求。\n", + "\n", + "[客户问题] 我不想要一楼的房子。\n", + "[销售回答] 我理解您的顾虑,我们还有多个楼层的房源可以选择。\n", + "\n" + ] + } + ], + "source": [ + "for doc in docs:\n", + " print(doc.page_content + \"\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "b335c9e5-bc01-4a47-916c-03bec0e7a839", + "metadata": {}, + "source": [ + "#### 使用 similarity_score_threshold 设置阈值,提升结果的相关性质量" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b1298956-6cf3-4a68-a3c2-a3149f64f156", + "metadata": {}, + "outputs": [], + 
"source": [ + "retriever = db.as_retriever(\n", + " search_type=\"similarity_score_threshold\",\n", + " search_kwargs={\"score_threshold\": 0.8}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d61167de-72ed-4618-a2e1-5df04784a3bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[客户问题] 我担心楼下太吵。\n", + "[销售回答] 这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。\n", + "\n" + ] + } + ], + "source": [ + "docs = retriever.get_relevant_documents(query)\n", + "for doc in docs:\n", + " print(doc.page_content + \"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "548d1b52-6fa2-46d6-9218-908db572f380", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c31344ce-f952-4604-b700-fdbb34da1e28", + "metadata": {}, + "source": [ + "### 提取向量数据库中的`销售回答`" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "25768463-da65-4762-978d-51c0c3a9c24b", + "metadata": {}, + "outputs": [], + "source": [ + "docs = retriever.get_relevant_documents(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4838d3da-dca3-4c31-80da-cec75760a833", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[客户问题] 我担心楼下太吵。\\n[销售回答] 这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].page_content" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "1ba30232-f468-4102-b70c-02c8b74da43d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['[客户问题] 我担心楼下太吵。\\n', '这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0].page_content.split(\"[销售回答] \")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bda8eb0c-eb1f-47b2-91ab-4c94ff71c3db", + "metadata": {}, + "outputs": [], + "source": [ + "ans = docs[0].page_content.split(\"[销售回答] \")[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "917a3b6c-3149-42e2-b7b6-abebd94b7f72", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ans" + ] + }, + { + "cell_type": "markdown", + "id": "b36b61a5-156d-4546-bbd1-8a983e94676b", + "metadata": {}, + "source": [ + "#### 尝试各种问题" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c311a656-e571-45d6-8690-88e4be2c2a65", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "\n", + "def sales(query: str, score_threshold: float=0.8) -> List[str]:\n", + " retriever = db.as_retriever(search_type=\"similarity_score_threshold\", search_kwargs={\"score_threshold\": score_threshold}) \n", + " docs = retriever.get_relevant_documents(query)\n", + " ans_list = [doc.page_content.split(\"[销售回答] \")[-1] for doc in docs]\n", + "\n", + " return ans_list" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2f4c6c08-b97c-4727-9aa1-a474d85a11e4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniconda3/envs/langchain/lib/python3.10/site-packages/langchain/vectorstores/base.py:266: 
UserWarning: No relevant docs were retrieved using the relevance score threshold 0.8\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "query = \"我想离医院近点\"\n", + "\n", + "print(sales(query))" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "11e3d8b6-839f-4b2f-b519-841e0271f95b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['有的,距离我们小区不远就有几家大型综合医院。', '是的,附近有多家大型医院,医疗资源非常丰富。']\n" + ] + } + ], + "source": [ + "print(sales(query, 0.75))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "fc5b030a-ae6f-4d42-9577-1420c78aecb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "score:0.8 ans: []\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/root/miniconda3/envs/langchain/lib/python3.10/site-packages/langchain/vectorstores/base.py:266: UserWarning: No relevant docs were retrieved using the relevance score threshold 0.75\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "score:0.75 ans: []\n", + "\n", + "score:0.5 ans: ['我们有不同户型和付款方案,一定有适合您预算的。', '我们的房子位于黄金地段,升值潜力很大,转卖起来不会有问题。', '定金金额取决于您选择的房型和付款方式,我们可以详细为您解释。', '这个区域正在快速发展,未来的升值潜力非常大。']\n", + "\n" + ] + } + ], + "source": [ + "query = \"价格200万以内\"\n", + "\n", + "print(f\"score:0.8 ans: {sales(query)}\\n\")\n", + "print(f\"score:0.75 ans: {sales(query, 0.75)}\\n\")\n", + "print(f\"score:0.5 ans: {sales(query, 0.5)}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "9e26e6d7-259b-4ecf-9c72-027befba4e3b", + "metadata": {}, + "source": [ + "#### 当向量数据库中没有合适答案时,使用大语言模型能力" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9142feb4-980a-4142-b367-1401021dceef", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQA\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n", + "qa_chain = RetrievalQA.from_chain_type(llm,\n", + " retriever=db.as_retriever(search_type=\"similarity_score_threshold\",\n", + " search_kwargs={\"score_threshold\": 0.8}))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d281b560-54bb-4a25-a1dc-b23874361654", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'query': '价格200万以内', 'result': '请问您是在询问什么产品或服务的价格?'}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_chain({\"query\": query})" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "c6a30dcc-fa6b-4282-935c-b07902746e19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'query': '小区吵不吵',\n", + " 'result': '这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。所以一般情况下,小区内部的噪音会比较低。'}" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_chain({\"query\": \"小区吵不吵\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "9da44bd6-02ef-4fb6-8e9e-7c99aeba483a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['这个小区特别注重居住体验,我们有良好的隔音设计,并且小区内部规划了绿化区域,可以有效降低噪音。']\n" + ] + } + ], + "source": [ + "print(sales(\"小区吵不吵\"))" + ] + }, + { + "cell_type": "markdown", + "id": "0a0647e8-5b41-4c93-87c2-3240ec558717", + "metadata": {}, + "source": [ + "## 加载 FAISS 向量数据库已有结果" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": 
"913edd04-37ea-4c3f-9346-086aeb0ab447", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import FAISS\n", + "\n", + "db = FAISS.load_local(\"real_states_sale\", OpenAIEmbeddings())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "77d8eee4-6383-4b4a-9737-95ea9fc53b87", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQA\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n", + "qa_chain = RetrievalQA.from_chain_type(llm,\n", + " retriever=db.as_retriever(search_type=\"similarity_score_threshold\",\n", + " search_kwargs={\"score_threshold\": 0.8}))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "5dfac197-cadb-47d7-8cbe-1b07a83fc8e1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'query': '我想买别墅,你们有么',\n", + " 'result': '很抱歉,我是一个AI助手,无法提供实体房产的购买信息。建议您联系房地产中介或者在房产网站上搜索别墅的相关信息。'}" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_chain({\"query\": \"我想买别墅,你们有么\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "fc393afc-06d3-4483-bd24-efe000ef5f2a", + "metadata": {}, + "outputs": [], + "source": [ + "# 输出内部 Chain 的日志\n", + "qa_chain.combine_documents_chain.verbose = True" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "013c313a-5e7c-48d1-8ed4-eee96e50a99c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new StuffDocumentsChain chain...\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "{'query': '我想买别墅,你们有么',\n", + " 'result': '很抱歉,我是一个AI助手,无法提供房地产销售服务。建议您联系当地的房地产中介或者通过房地产网站来寻找别墅的购买信息。'}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "qa_chain({\"query\": \"我想买别墅,你们有么\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "e5fb0943-ffe9-4270-8dbe-f5f5314e6042", + "metadata": {}, + "outputs": [], + "source": [ + "# 返回向量数据库的检索结果\n", + "qa_chain.return_source_documents = True" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fbc1741e-d34f-4df5-874b-02b78e8cd67a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new StuffDocumentsChain chain...\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + } + ], + "source": [ + "result = qa_chain({\"query\": \"我想买别墅,你们有么\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "fac8309c-033c-4144-ada0-08e74fa9bf2d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'query': '我想买别墅,你们有么',\n", + " 'result': '很抱歉,我是一个AI助手,无法提供实际的房产销售信息。建议您联系当地的房地产中介或者在房产网站上搜索别墅的销售信息。',\n", + " 'source_documents': []}" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37de84df-999f-4469-9bbe-de9c95c6f6b3", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, 
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/jupyter/homework/panxizhi/sales_chatbot/sales_chatbot.py b/langchain/jupyter/homework/panxizhi/sales_chatbot/sales_chatbot.py
new file mode 100644
index 00000000..11d5d547
--- /dev/null
+++ b/langchain/jupyter/homework/panxizhi/sales_chatbot/sales_chatbot.py
@@ -0,0 +1,86 @@
+import gradio as gr
+import random
+import time
+from gradio.events import (
+    Changeable,
+    EventListenerMethod,
+    Focusable,
+    Inputable,
+    Selectable,
+)
+
+from typing import List
+
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from langchain.chains import RetrievalQA
+from langchain.chat_models import ChatOpenAI
+
+from create_sales_vector_dbs import load_data_to_vectordb
+
+sales_db = {
+    "房产": "real_estate_sales_data",
+    "电器": "sales_eletronic_stuffs"
+}
+
+def reload_sales_bot(sales_type="房产"):
+    print(f"[sales_type]{sales_type}")
+    global SALES_BOT
+    SALES_BOT = initialize_sales_bot(sales_db[sales_type])
+
+def initialize_sales_bot(vector_store_dir: str="real_estate_sales_data"):
+    db = FAISS.load_local(vector_store_dir, OpenAIEmbeddings())
+    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
+
+    global SALES_BOT
+    SALES_BOT = RetrievalQA.from_chain_type(llm,
+                                            retriever=db.as_retriever(search_type="similarity_score_threshold",
+                                                                      search_kwargs={"score_threshold": 0.8}))
+    # Also return the documents retrieved from the vector store
+    SALES_BOT.return_source_documents = True
+
+    return SALES_BOT
+
+def sales_chat(message, history, sales_type="房产"):
+    print(f"[message]{message}")
+    print(f"[history]{history}")
+    print(f'''[sales_type]{sales_type}''')
+    # TODO: read this from a command-line argument
+    enable_chat = True
+
+    ans = SALES_BOT({"query": message})
+    # If documents were retrieved, or LLM chat mode is enabled,
+    # return the answer composed by RetrievalQA's combine_documents_chain
+    if ans["source_documents"] or enable_chat:
+        print(f"[result]{ans['result']}")
+        print(f"[source_documents]{ans['source_documents']}")
+        return ans["result"]
+    # Otherwise fall back to a canned reply
+    else:
+        return "这个问题我要问问领导"
+
+
+def launch_gradio():
+    global salesDemo
+    salesType = gr.Dropdown(label="产品类型", choices=["房产","电器"], value="房产")
+
+    with gr.ChatInterface(fn=sales_chat,
+                          title="销售助理",
+                          additional_inputs=[
+                              salesType
+                          ],
+                          additional_inputs_accordion_name="更多选项",
+                          chatbot=gr.Chatbot(height=600)) as demo:
+
+        salesType.change(reload_sales_bot, salesType)
+
+        demo.launch(share=True, server_name="0.0.0.0")
+
+if __name__ == "__main__":
+    # Create the vector stores
+    load_data_to_vectordb("data/sales_eletronic_stuffs.txt", "sales_eletronic_stuffs")
+    load_data_to_vectordb("data/real_estate_sales_data.txt", "real_estate_sales_data")
+    # Initialize the real-estate sales bot
+    initialize_sales_bot()
+    # Launch the Gradio app
+    launch_gradio()
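Note: `sales_chatbot.py` imports `load_data_to_vectordb` from `create_sales_vector_dbs`, a module that is not included in this part of the diff. For reference only, here is a minimal sketch of what such a helper might look like using the same langchain APIs as the notebook above; the function body, the splitting parameters, and the blank-line file format are assumptions, not the author's actual implementation, and this block is not part of the patch.

```python
# Hypothetical sketch of create_sales_vector_dbs.load_data_to_vectordb.
# Assumptions: the data file holds "[客户问题] ... / [销售回答] ..." pairs
# separated by blank lines, and OPENAI_API_KEY is set in the environment.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def load_data_to_vectordb(data_file: str, vector_store_dir: str) -> None:
    with open(data_file, encoding="utf-8") as f:
        raw_text = f.read()

    # Split on blank lines so each Q&A pair becomes one document
    splitter = CharacterTextSplitter(separator="\n\n", chunk_size=100, chunk_overlap=0)
    docs = splitter.create_documents([raw_text])

    # Embed the documents and persist the FAISS index locally
    db = FAISS.from_documents(docs, OpenAIEmbeddings())
    db.save_local(vector_store_dir)
```

Whatever the real implementation does, the directory it saves to has to match the values in `sales_db` ("real_estate_sales_data", "sales_eletronic_stuffs"), since `initialize_sales_bot` and `reload_sales_bot` load those paths with `FAISS.load_local`.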